"""Constants shared by the LVBench leaderboard: column layout, data locations and UI text."""

# NOTE: `os` is no longer used in this module. The import is kept on purpose:
# a module that star-imports these constants would also receive `os`, and this
# file alone cannot show whether any importer relies on that.
import os

# ---------------------------------------------------------------------------
# Leaderboard table layout
# ---------------------------------------------------------------------------

# Columns that describe a submitted model.
MODEL_INFO = ["Model", "Language Model", "Date"]

# Score columns. The abbreviations are presumably the LVBench task categories
# -- TODO confirm their expansions against the benchmark documentation.
TASK_INFO = ["Overall", "ER", "EU", "KIR", "TG", "Rea", "Sum"]

# Currently holds the same labels as TASK_INFO, but is a separate list object.
AVG_INFO = ["Overall", "ER", "EU", "KIR", "TG", "Rea", "Sum"]

# One display type per entry of COLUMN_NAMES: the three MODEL_INFO columns are
# 'markdown', the seven TASK_INFO columns are 'number'.
# The identifier is misspelled ("TITILE"); it is kept unchanged because other
# modules may import it under this name.
DATA_TITILE_TYPE = [
    'markdown',
    'markdown',
    'markdown',
    'number',
    'number',
    'number',
    'number',
    'number',
    'number',
    'number',
]

# ---------------------------------------------------------------------------
# Submission storage
# ---------------------------------------------------------------------------

SUBMISSION_NAME = "LVBench_submission"

# Built by plain string concatenation rather than os.path.join: a URL always
# uses "/" as its separator, whereas os.path.join follows the conventions of
# the host operating system's filesystem.
SUBMISSION_URL = "https://huggingface.co/datasets/THUDM/" + SUBMISSION_NAME

# Despite the "_DIR" suffix, this is the path of the results CSV file itself.
CSV_DIR = "./LVBench_submission/result.csv"

# Full, ordered list of table columns: model description first, then scores.
COLUMN_NAMES = MODEL_INFO + TASK_INFO

# ---------------------------------------------------------------------------
# UI text (Markdown)
# ---------------------------------------------------------------------------

# The identifiers below spell "LEADERBORAD" (sic); they are kept unchanged
# because other modules may import them under these names.
LEADERBORAD_INTRODUCTION = """# LVBench Leaderboard

Welcome to the leaderboard of the LVBench! 🏆

You can prepare your submission by following the [instructions](https://github.com/THUDM/LVBench?tab=readme-ov-file#get-evaluation-results-and-submit-to-leaderboard).
"""

SUBMIT_INTRODUCTION = """# Submit on LVBench Benchmark Introduction
"""

# Whitespace-only placeholder.
TABLE_INTRODUCTION = """ """

LEADERBORAD_INFO = """
Recent progress in multimodal large language models has markedly enhanced the understanding of short videos (typically under one minute), and several evaluation datasets have emerged accordingly. However, these advancements fall short of meeting the demands of real-world applications such as embodied intelligence for long-term decision-making, in-depth movie reviews and discussions, and live sports commentary, all of which require comprehension of long videos spanning several hours. To address this gap, we introduce LVBench, a benchmark specifically designed for long video understanding. Our dataset comprises publicly sourced videos, including TV series, sports broadcasts, and everyday surveillance footage, and encompasses a diverse set of tasks aimed at long video comprehension and information extraction. By leveraging a combination of manual annotations and model-assisted techniques, we have created a robust video understanding question-answer dataset. 
LVBench is designed to challenge multimodal models to demonstrate long-term memory and extended comprehension capabilities. Our extensive evaluations of various baseline models reveal that current multimodal large language models still underperform on these demanding long video understanding tasks. Through LVBench, we aim to spur the development of more advanced models capable of tackling the complexities of long video comprehension.
"""

# ---------------------------------------------------------------------------
# Citation widget
# ---------------------------------------------------------------------------

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"

# Raw string so the BibTeX entry is stored verbatim, with no escape processing.
CITATION_BUTTON_TEXT = r"""@misc{wang2024lvbench,
      title={LVBench: An Extreme Long Video Understanding Benchmark},
      author={Weihan Wang and Zehai He and Wenyi Hong and Yean Cheng and Xiaohan Zhang and Ji Qi and Shiyu Huang and Bin Xu and Yuxiao Dong and Ming Ding and Jie Tang},
      year={2024},
      eprint={2406.08035},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}"""