const generatedBibEntries = { "10.1007/978-3-319-46454-1_24": { "abstract": "\"There is considerable interest in the task of automatically generating image captions. However, evaluation is challenging. Existing automatic evaluation metrics are primarily sensitive to n-gram overlap, which is neither necessary nor sufficient for the task of simulating human judgment. We hypothesize that semantic propositional content is an important component of human caption evaluation, and propose a new automated caption evaluation metric defined over scene graphs coined SPICE. Extensive evaluations across a range of models and datasets indicate that SPICE captures human judgments over model-generated captions better than other automatic metrics (e.g., system-level correlation of 0.88 with human judgments on the MS COCO dataset, versus 0.43 for CIDEr and 0.53 for METEOR). Furthermore, SPICE can answer questions such as which caption-generator best understands colors? and can caption-generators count?\",", "address": "\"Cham\",", "author": "\"Anderson, Peter and Fernando, Basura and Johnson, Mark and Gould, Stephen\",", "booktitle": "\"Computer Vision -- ECCV 2016\",", "editor": "\"Leibe, Bastian and Matas, Jiri and Sebe, Nicu and Welling, Max\",", "isbn": "\"978-3-319-46454-1\"", "pages": "\"382--398\",", "publisher": "\"Springer International Publishing\",", "title": "\"SPICE: Semantic Propositional Image Caption Evaluation\",", "type": "InProceedings", "year": "\"2016\"," }, "10.1016/j.neunet.2021.07.019": { "address": "GBR", "author": "Frolov, Stanislav and Hinz, Tobias and Raue, Federico and Hees, J\\\"{o}rn and Dengel, Andreas", "doi": "10.1016/j.neunet.2021.07.019", "issn": "0893-6080", "issue_date": "Dec 2021", "journal": "Neural Netw.", "keywords": "Generative adversarial networks, Text-to-image synthesis", "month": "dec", "number": "C", "numpages": "23", "pages": "187\u2013209", "publisher": "Elsevier Science Ltd.", "title": "Adversarial text-to-image synthesis: A review", 
"type": "article", "url": "https://doi.org/10.1016/j.neunet.2021.07.019", "volume": "144", "year": "2021" }, "10.1145/1809028.1806638": { "abstract": "MapReduce and similar systems significantly ease the task of writing data-parallel code. However, many real-world computations require a pipeline of MapReduces, and programming and managing such pipelines can be difficult. We present FlumeJava, a Java library that makes it easy to develop, test, and run efficient data-parallel pipelines. At the core of the FlumeJava library are a couple of classes that represent immutable parallel collections, each supporting a modest number of operations for processing them in parallel. Parallel collections and their operations present a simple, high-level, uniform abstraction over different data representations and execution strategies. To enable parallel operations to run efficiently, FlumeJava defers their evaluation, instead internally constructing an execution plan dataflow graph. When the final results of the parallel operations are eventually needed, FlumeJava first optimizes the execution plan, and then executes the optimized operations on appropriate underlying primitives (e.g., MapReduces). The combination of high-level abstractions for parallel data and computation, deferred evaluation and optimization, and efficient parallel primitives yields an easy-to-use system that approaches the efficiency of hand-optimized pipelines. FlumeJava is in active use by hundreds of pipeline developers within Google.", "address": "New York, NY, USA", "author": "Chambers, Craig and Raniwala, Ashish and Perry, Frances and Adams, Stephen and Henry, Robert R. 
and Bradshaw, Robert and Weizenbaum, Nathan", "doi": "10.1145/1809028.1806638", "issn": "0362-1340", "issue_date": "June 2010", "journal": "SIGPLAN Not.", "keywords": "data-parallel programming, java, mapreduce", "month": "jun", "number": "6", "numpages": "13", "pages": "363\u2013375", "publisher": "Association for Computing Machinery", "title": "FlumeJava: easy, efficient data-parallel pipelines", "type": "article", "url": "https://doi.org/10.1145/1809028.1806638", "volume": "45", "year": "2010" }, "10.1145/3461353.3461388": { "abstract": "In recent years, deep learning technology has made breakthroughs in computer vision. After using large-scale data training, the deep neural network represented by GAN is significantly better than previous technologies in image generation, including generating more reasonable, higher-definition, more complex, and more accurate images. With the continuous development of datasets, models, and applications, the fusion of different modal information, including fusion of natural language, semantic layouts, tags, edge maps, and other different modal information, to generate images has become a new demand and challenge. There are related reviews on image generation and multimodal deep learning. However, there has not been a review dedicated to multimodal deep-learning image generation to discuss the current status, existing problems, and challenges of this task. Therefore, this review proposes a survey on multimodal deep learning image generation. It aims to provide readers with an application scenario for multimodal deep learning image generation. Also, it provides readers with new multimodal deep learning image generation technologies, the relevant datasets, evaluation metrics used, and some results comparison. 
Finally, this article describes some of the challenges and future topics of multimodal deep learning image generation.", "address": "New York, NY, USA", "author": "Luo, Sanbi", "booktitle": "Proceedings of the 2021 5th International Conference on Innovation in Artificial Intelligence", "doi": "10.1145/3461353.3461388", "isbn": "9781450388634", "keywords": "multimodal, machine learning, image synthesis, deep learning, computer vision", "location": "Xia men, China", "numpages": "13", "pages": "108\u2013120", "publisher": "Association for Computing Machinery", "series": "ICIAI '21", "title": "A Survey on Multimodal Deep Learning for Image Synthesis: Applications, methods, datasets, evaluation metrics, and results comparison", "type": "inproceedings", "url": "https://doi.org/10.1145/3461353.3461388", "year": "2021" }, "10.3115/1073083.1073135": { "abstract": "Human evaluations of machine translation are extensive but expensive. Human evaluations can take months to finish and involve human labor that can not be reused. We propose a method of automatic machine translation evaluation that is quick, inexpensive, and language-independent, that correlates highly with human evaluation, and that has little marginal cost per run. 
We present this method as an automated understudy to skilled human judges which substitutes for them when there is need for quick or frequent evaluations.", "address": "USA", "author": "Papineni, Kishore and Roukos, Salim and Ward, Todd and Zhu, Wei-Jing", "booktitle": "Proceedings of the 40th Annual Meeting on Association for Computational Linguistics", "doi": "10.3115/1073083.1073135", "location": "Philadelphia, Pennsylvania", "numpages": "8", "pages": "311\u2013318", "publisher": "Association for Computational Linguistics", "series": "ACL '02", "title": "BLEU: a method for automatic evaluation of machine translation", "type": "inproceedings", "url": "https://doi.org/10.3115/1073083.1073135", "year": "2002" }, "10.5555/3295222.3295408": { "address": "Red Hook, NY, USA", "author": "Heusel, Martin and Ramsauer, Hubert and Unterthiner, Thomas and Nessler, Bernhard and Hochreiter, Sepp", "isbn": "9781510860964", "location": "Long Beach, California, USA", "numpages": "12", "pages": "6629\u20136640", "publisher": "Curran Associates Inc.", "series": "NIPS'17", "title": "GANs trained by a two time-scale update rule converge to a local nash equilibrium", "type": "inproceedings", "year": "2017" }, "10081412": { "author": "Croitoru, Florinel-Alin and Hondru, Vlad and Ionescu, Radu Tudor and Shah, Mubarak", "doi": "10.1109/TPAMI.2023.3261988", "journal": "IEEE Transactions on Pattern Analysis and Machine Intelligence", "keywords": "Computational modeling;Mathematical models;Noise reduction;Data models;Computer vision;Training;Task analysis;Denoising diffusion models;deep generative modeling;diffusion models;image generation;noise conditioned score networks;score-based models", "number": "9", "pages": "10850-10869", "title": "Diffusion Models in Vision: A Survey", "type": "ARTICLE", "volume": "45", "year": "2023" }, "10123038": { "author": "Xu, Peng and Zhu, Xiatian and Clifton, David A.", "doi": "10.1109/TPAMI.2023.3275156", "journal": "IEEE Transactions on Pattern Analysis 
and Machine Intelligence", "keywords": "Transformers;Task analysis;Surveys;Visualization;Taxonomy;Mathematical models;Data models;Multimodal learning;transformer;introductory;taxonomy;deep learning;machine learning", "number": "10", "pages": "12113-12132", "title": "Multimodal Learning With Transformers: A Survey", "type": "ARTICLE", "volume": "45", "year": "2023" }, "10218041": { "author": "Abdulghanni, Sara Faez and Abdulmunem, Ashwan A.", "booktitle": "2023 Al-Sadiq International Conference on Communication and Information Technology (AICCIT)", "doi": "10.1109/AICCIT57614.2023.10218041", "keywords": "Measurement;Training;Surveys;Deep learning;Image synthesis;Transforms;Stability analysis;Image generation;Diffusion model;Stable diffusion;Generative models;Deep learning", "number": "", "pages": "171-175", "title": "Image Generation Conditioned on Text Using Deep Learning Models: Survey", "type": "INPROCEEDINGS", "volume": "", "year": "2023" }, "4176674": { "author": "Dolan, Brian", "booktitle": "2006 Fortieth Asilomar Conference on Signals, Systems and Computers", "doi": "10.1109/ACSSC.2006.354864", "keywords": "Mammography;Computer vision;Design automation;Humans;Medical diagnostic imaging;Breast cancer;Protocols;Biopsy;Gold;Visualization", "number": "", "pages": "821-825", "title": "Computer Aided Diagnosis in Mammography: Its Development and Early Challenges", "type": "INPROCEEDINGS", "volume": "", "year": "2006" }, "5412098": { "author": "Thung, Kim-Han and Raveendran, Paramesran", "booktitle": "2009 International Conference for Technical Postgraduates (TECHPOS)", "doi": "10.1109/TECHPOS.2009.5412098", "keywords": "Image quality;Digital images;Signal processing;Image coding;Image storage;Signal processing algorithms;Noise reduction;Video compression;PSNR;Gaussian noise", "number": "", "pages": "1-4", "title": "A survey of image quality measures", "type": "INPROCEEDINGS", "volume": "", "year": "2009" }, "5430991": { "author": "Saad, Michele A. 
and Bovik, Alan C. and Charrier, Christophe", "doi": "10.1109/LSP.2010.2045550", "journal": "IEEE Signal Processing Letters", "keywords": "Discrete cosine transforms;Image quality;Statistics;Feature extraction;Layout;Distortion measurement;Support vector machines;Testing;Machine learning algorithms;Machine learning;Anisotropy;discrete cosine transform;kurtosis;natural scene statistics;no-reference quality assessment", "number": "6", "pages": "583-586", "title": "A DCT Statistics-Based Blind Image Quality Index", "type": "ARTICLE", "volume": "17", "year": "2010" }, "5596999": { "author": "Hor\u00e9, Alain and Ziou, Djemel", "booktitle": "2010 20th International Conference on Pattern Recognition", "doi": "10.1109/ICPR.2010.579", "keywords": "PSNR;Degradation;Image quality;Additives;Transform coding;Sensitivity;Image coding;PSNR;SSIM;image quality metrics", "number": "", "pages": "2366-2369", "title": "Image Quality Metrics: PSNR vs. SSIM", "type": "INPROCEEDINGS", "volume": "", "year": "2010" }, "5995446": { "author": "Tang, Huixuan and Joshi, Neel and Kapoor, Ashish", "booktitle": "CVPR 2011", "doi": "10.1109/CVPR.2011.5995446", "keywords": "Image quality;Distortion measurement;Transform coding;Histograms;Noise;Kernel;Degradation", "number": "", "pages": "305-312", "title": "Learning a blind measure of perceptual image quality", "type": "INPROCEEDINGS", "volume": "", "year": "2011" }, "6165361": { "author": "Ye, Peng and Doermann, David", "doi": "10.1109/TIP.2012.2190086", "journal": "IEEE Transactions on Image Processing", "keywords": "Feature extraction;Training;Image quality;Visualization;Transform coding;Databases;Image coding;Gabor filter;no-reference image quality assessment (NRIQA);texture analysis;visual codebook", "number": "7", "pages": "3129-3138", "title": "No-Reference Image Quality Assessment Using Visual Codebooks", "type": "ARTICLE", "volume": "21", "year": "2012" }, "6172573": { "author": "Saad, Michele A. and Bovik, Alan C. 
and Charrier, Christophe", "doi": "10.1109/TIP.2012.2191563", "journal": "IEEE Transactions on Image Processing", "keywords": "Discrete cosine transforms;Feature extraction;Visualization;Humans;Computational modeling;Predictive models;Image quality;Discrete cosine transform (DCT);generalized Gaussian density;natural scene statistics;no-reference image quality assessment", "number": "8", "pages": "3339-3352", "title": "Blind Image Quality Assessment: A Natural Scene Statistics Approach in the DCT Domain", "type": "ARTICLE", "volume": "21", "year": "2012" }, "6190099": { "author": "Mittal, Anish and Moorthy, Anush K. and Bovik, Alan C.", "booktitle": "2011 Conference Record of the Forty Fifth Asilomar Conference on Signals, Systems and Computers (ASILOMAR)", "doi": "10.1109/ACSSC.2011.6190099", "keywords": "Humans;Image quality;Transform coding;Correlation;Databases;Measurement;Computational modeling", "number": "", "pages": "723-727", "title": "Blind/Referenceless Image Spatial Quality Evaluator", "type": "INPROCEEDINGS", "volume": "", "year": "2011" }, "6353522": { "author": "Mittal, Anish and Soundararajan, Rajiv and Bovik, Alan C.", "doi": "10.1109/LSP.2012.2227726", "journal": "IEEE Signal Processing Letters", "keywords": "Image quality;Image processing;Statistical analysis;Feature extraction;Completely blind;distortion free;image quality assessment;no reference", "number": "3", "pages": "209-212", "title": "Making a \u201cCompletely Blind\u201d Image Quality Analyzer", "type": "ARTICLE", "volume": "20", "year": "2013" }, "7084843": { "author": "Venkatanath N and Praneeth D and Maruthi Chandrasekhar Bh and Channappayya, Sumohana S. 
and Medasani, Swarup S.", "booktitle": "2015 Twenty First National Conference on Communications (NCC)", "doi": "10.1109/NCC.2015.7084843", "keywords": "Image quality;Noise;Databases;Feature extraction;Standards;Transform coding;Image segmentation;No reference image quality assessment;spatial activity;Perceptual quality", "number": "", "pages": "1-6", "title": "Blind image quality evaluation using perception based features", "type": "INPROCEEDINGS", "volume": "", "year": "2015" }, "8195348": { "author": "Wu, Xian and Xu, Kun and Hall, Peter", "doi": "10.23919/TST.2017.8195348", "journal": "Tsinghua Science and Technology", "keywords": "Gallium nitride;Image generation;Generators;Image resolution;Feature extraction;Training;Linear programming;image synthesis;image editing;constrained image synthesis;generative adversarial networks;image-to-image translation", "number": "6", "pages": "660-674", "title": "A survey of image synthesis and editing with generative adversarial networks", "type": "ARTICLE", "volume": "22", "year": "2017" }, "8578241": { "address": "Los Alamitos, CA, USA", "author": "T. Xu and P. Zhang and Q. Huang and H. Zhang and Z. Gan and X. Huang and X. 
He", "booktitle": "2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)", "doi": "10.1109/CVPR.2018.00143", "issn": "", "keywords": "gallium nitride;generative adversarial networks;computational modeling;image generation;generators;semantics;visualization", "month": "jun", "pages": "1316-1324", "publisher": "IEEE Computer Society", "title": "AttnGAN: Fine-Grained Text to Image Generation with Attentional Generative Adversarial Networks", "type": "INPROCEEDINGS", "url": "https://doi.ieeecomputersociety.org/10.1109/CVPR.2018.00143", "volume": "", "year": "2018" }, "9495208": { "author": "Xia, Jiazhi and Lin, Weixing and Jiang, Guang and Wang, Yunhai and Chen, Wei and Schreck, Tobias", "doi": "10.1109/MCG.2021.3098804", "journal": "IEEE Computer Graphics and Applications", "keywords": "Visualization;Shape analysis;Visual perception;Clustering algorithms;Deep learning;Splines (mathematics)", "number": "5", "pages": "79-89", "title": "Visual Clustering Factors in Scatterplots", "type": "ARTICLE", "volume": "41", "year": "2021" }, "AUTOMATIC1111_Stable_Diffusion_Web_2022": { "author": "AUTOMATIC1111", "month": "aug,", "title": "Stable Diffusion Web UI", "type": "software", "url": "https://github.com/AUTOMATIC1111/stable-diffusion-webui", "year": "2022" }, "Agrawal_2019_ICCV": { "author": "Agrawal, Harsh and Desai, Karan and Wang, Yufei and Chen, Xinlei and Jain, Rishabh and Johnson, Mark and Batra, Dhruv and Parikh, Devi and Lee, Stefan and Anderson, Peter", "booktitle": "Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)", "month": "October", "title": "nocaps: novel object captioning at scale", "type": "InProceedings", "year": "2019" }, "Antol_2015_ICCV": { "author": "Antol, Stanislaw and Agrawal, Aishwarya and Lu, Jiasen and Mitchell, Margaret and Batra, Dhruv and Zitnick, C. 
Lawrence and Parikh, Devi", "booktitle": "Proceedings of the IEEE International Conference on Computer Vision (ICCV)", "month": "December", "title": "VQA: Visual Question Answering", "type": "InProceedings", "year": "2015" }, "BahdanauCB14": { "author": "Dzmitry Bahdanau and Kyunghyun Cho and Yoshua Bengio", "bibsource": "dblp computer science bibliography, https://dblp.org", "biburl": "https://dblp.org/rec/journals/corr/BahdanauCB14.bib", "booktitle": "3rd International Conference on Learning Representations, {ICLR} 2015, San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings", "editor": "Yoshua Bengio and Yann LeCun", "timestamp": "Wed, 17 Jul 2019 10:40:54 +0200", "title": "Neural Machine Translation by Jointly Learning to Align and Translate", "type": "inproceedings", "url": "http://arxiv.org/abs/1409.0473", "year": "2015" }, "Chang2023": { "author": "\"Nadine Chang\",", "doi": "\"10.1184/R1/23396759.v1\"", "month": "\"6\",", "title": "\"{Bridging the Gap Between Human Vision and Computer Vision}\",", "type": "article", "url": "\"https://kilthub.cmu.edu/articles/thesis/Bridging_the_Gap_Between_Human_Vision_and_Computer_Vision/23396759\",", "year": "\"2023\"," }, "Changpinyo_2021_CVPR": { "author": "Changpinyo, Soravit and Sharma, Piyush and Ding, Nan and Soricut, Radu", "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)", "month": "June", "pages": "3558-3568", "title": "Conceptual 12M: Pushing Web-Scale Image-Text Pre-Training To Recognize Long-Tail Visual Concepts", "type": "InProceedings", "year": "2021" }, "Chen_2018_CVPR": { "author": "Chen, Jingwen and Chen, Jiawei and Chao, Hongyang and Yang, Ming", "booktitle": "Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)", "month": "June", "title": "Image Blind Denoising With Generative Adversarial Network Based Noise Modeling", "type": "InProceedings", "year": "2018" }, "Chen_2023_ICCV": { "author": "Chen, Rui and 
Chen, Yongwei and Jiao, Ningxin and Jia, Kui", "booktitle": "Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)", "month": "October", "pages": "22246-22256", "title": "Fantasia3D: Disentangling Geometry and Appearance for High-quality Text-to-3D Content Creation", "type": "InProceedings", "year": "2023" }, "Chen_2024_WACV": { "author": "Chen, Minghao and Laina, Iro and Vedaldi, Andrea", "booktitle": "Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)", "month": "January", "pages": "5343-5353", "title": "Training-Free Layout Control With Cross-Attention Guidance", "type": "InProceedings", "year": "2024" }, "Cho_2023_ICCV": { "author": "Cho, Jaemin and Zala, Abhay and Bansal, Mohit", "booktitle": "Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)", "month": "October", "pages": "3043-3054", "title": "DALL-Eval: Probing the Reasoning Skills and Social Biases of Text-to-Image Generation Models", "type": "InProceedings", "year": "2023" }, "Cui_2018_CVPR": { "author": "Cui, Yin and Yang, Guandao and Veit, Andreas and Huang, Xun and Belongie, Serge", "booktitle": "Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)", "month": "June", "title": "Learning to Evaluate Image Captioning", "type": "InProceedings", "year": "2018" }, "Desai_2021_CVPR": { "author": "Desai, Karan and Johnson, Justin", "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)", "month": "June", "pages": "11162-11173", "title": "VirTex: Learning Visual Representations From Textual Annotations", "type": "InProceedings", "year": "2021" }, "Gokhale2022BenchmarkingSR": { "author": "Tejas Gokhale and Hamid Palangi and Besmira Nushi and Vibhav Vineet and Eric Horvitz and Ece Kamar and Chitta Baral and Yezhou Yang", "journal": "ArXiv", "title": "Benchmarking Spatial Relationships in Text-to-Image Generation", "type": "article", "url": 
"https://api.semanticscholar.org/CorpusID:254877055", "volume": "abs/2212.10015", "year": "2022" }, "Goyal_2017_CVPR": { "author": "Goyal, Yash and Khot, Tejas and Summers-Stay, Douglas and Batra, Dhruv and Parikh, Devi", "booktitle": "Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)", "month": "July", "title": "Making the v in VQA Matter: Elevating the Role of Image Understanding in Visual Question Answering", "type": "InProceedings", "year": "2017" }, "Grimal_2024_tiam": { "author": "Grimal, Paul and Le Borgne, Herv\\'e and Ferret, Olivier and Tourille, Julien", "booktitle": "Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)", "month": "January", "pages": "2890-2899", "title": "TIAM - A Metric for Evaluating Alignment in Text-to-Image Generation", "type": "InProceedings", "year": "2024" }, "Hu_2021_ICCV": { "author": "Hu, Ronghang and Singh, Amanpreet", "booktitle": "Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)", "month": "October", "pages": "1439-1449", "title": "UniT: Multimodal Multitask Learning With a Unified Transformer", "type": "InProceedings", "year": "2021" }, "Hu_2023_ICCV": { "author": "Hu, Yushi and Liu, Benlin and Kasai, Jungo and Wang, Yizhong and Ostendorf, Mari and Krishna, Ranjay and Smith, Noah A.", "booktitle": "Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)", "month": "October", "pages": "20406-20417", "title": "TIFA: Accurate and Interpretable Text-to-Image Faithfulness Evaluation with Question Answering", "type": "InProceedings", "year": "2023" }, "Hudson_2019_CVPR": { "author": "Hudson, Drew A. 
and Manning, Christopher D.", "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)", "month": "June", "title": "GQA: A New Dataset for Real-World Visual Reasoning and Compositional Question Answering", "type": "InProceedings", "year": "2019" }, "Karras_2019_CVPR": { "author": "Karras, Tero and Laine, Samuli and Aila, Timo", "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)", "month": "June", "title": "A Style-Based Generator Architecture for Generative Adversarial Networks", "type": "InProceedings", "year": "2019" }, "Karras_2020_CVPR": { "author": "Karras, Tero and Laine, Samuli and Aittala, Miika and Hellsten, Janne and Lehtinen, Jaakko and Aila, Timo", "booktitle": "IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)", "month": "June", "title": "Analyzing and Improving the Image Quality of StyleGAN", "type": "InProceedings", "year": "2020" }, "Kirstain2023PickaPicAO": { "author": "Yuval Kirstain and Adam Polyak and Uriel Singer and Shahbuland Matiana and Joe Penna and Omer Levy", "journal": "ArXiv", "title": "Pick-a-Pic: An Open Dataset of User Preferences for Text-to-Image Generation", "type": "article", "url": "https://api.semanticscholar.org/CorpusID:258437096", "volume": "abs/2305.01569", "year": "2023" }, "Krishna2016VisualGC": { "author": "Ranjay Krishna and Yuke Zhu and Oliver Groth and Justin Johnson and Kenji Hata and Joshua Kravitz and Stephanie Chen and Yannis Kalantidis and Li-Jia Li and David A. Shamma and Michael S. Bernstein and Li Fei-Fei", "journal": "International Journal of Computer Vision", "pages": "32 - 73", "title": "Visual Genome: Connecting Language and Vision Using Crowdsourced Dense Image Annotations", "type": "article", "url": "https://api.semanticscholar.org/CorpusID:4492210", "volume": "123", "year": "2016" }, "Li2023BLIP2BL": { "author": "Junnan Li and Dongxu Li and Silvio Savarese and Steven C. H. 
Hoi", "booktitle": "International Conference on Machine Learning", "title": "BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models", "type": "inproceedings", "url": "https://api.semanticscholar.org/CorpusID:256390509", "year": "2023" }, "Li_2019_ICCV": { "author": "Li, Kunpeng and Zhang, Yulun and Li, Kai and Li, Yuanyuan and Fu, Yun", "booktitle": "Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)", "month": "October", "title": "Visual Semantic Reasoning for Image-Text Matching", "type": "InProceedings", "year": "2019" }, "Li_2022_CVPR": { "author": "Li, Zhiheng and Min, Martin Renqiang and Li, Kai and Xu, Chenliang", "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)", "month": "June", "pages": "18197-18207", "title": "StyleT2I: Toward Compositional and High-Fidelity Text-to-Image Synthesis", "type": "InProceedings", "year": "2022" }, "Lin_2023_CVPR": { "author": "Lin, Chen-Hsuan and Gao, Jun and Tang, Luming and Takikawa, Towaki and Zeng, Xiaohui and Huang, Xun and Kreis, Karsten and Fidler, Sanja and Liu, Ming-Yu and Lin, Tsung-Yi", "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)", "month": "June", "pages": "300-309", "title": "Magic3D: High-Resolution Text-to-3D Content Creation", "type": "InProceedings", "year": "2023" }, "Liu2023VisualIT": { "author": "Haotian Liu and Chunyuan Li and Qingyang Wu and Yong Jae Lee", "journal": "ArXiv", "title": "Visual Instruction Tuning", "type": "article", "url": "https://api.semanticscholar.org/CorpusID:258179774", "volume": "abs/2304.08485", "year": "2023" }, "Ma_2023_CVPR": { "author": "Ma, Zixian and Hong, Jerry and Gul, Mustafa Omer and Gandhi, Mona and Gao, Irena and Krishna, Ranjay", "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)", "month": "June", "pages": "10910-10921", "title": 
"CREPE: Can Vision-Language Foundation Models Reason Compositionally?", "type": "InProceedings", "year": "2023" }, "Metzer_2023_CVPR": { "author": "Metzer, Gal and Richardson, Elad and Patashnik, Or and Giryes, Raja and Cohen-Or, Daniel", "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)", "month": "June", "pages": "12663-12673", "title": "Latent-NeRF for Shape-Guided Generation of 3D Shapes and Textures", "type": "InProceedings", "year": "2023" }, "NEURIPS2019_65699726": { "author": "Zhou, Sharon and Gordon, Mitchell and Krishna, Ranjay and Narcomey, Austin and Fei-Fei, Li F and Bernstein, Michael", "booktitle": "Advances in Neural Information Processing Systems", "editor": "H. Wallach and H. Larochelle and A. Beygelzimer and F. d\\textquotesingle Alch\\'{e}-Buc and E. Fox and R. Garnett", "pages": "", "publisher": "Curran Associates, Inc.", "title": "HYPE: A Benchmark for Human eYe Perceptual Evaluation of Generative Models", "type": "inproceedings", "url": "https://proceedings.neurips.cc/paper_files/paper/2019/file/65699726a3c601b9f31bf04019c8593c-Paper.pdf", "volume": "32", "year": "2019" }, "NEURIPS2019_c74d97b0": { "author": "Lu, Jiasen and Batra, Dhruv and Parikh, Devi and Lee, Stefan", "booktitle": "Advances in Neural Information Processing Systems", "editor": "H. Wallach and H. Larochelle and A. Beygelzimer and F. d\\textquotesingle Alch\\'{e}-Buc and E. Fox and R. Garnett", "pages": "", "publisher": "Curran Associates, Inc.", "title": "ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks", "type": "inproceedings", "url": "https://proceedings.neurips.cc/paper_files/paper/2019/file/c74d97b01eae257e44aa9d5bade97baf-Paper.pdf", "volume": "32", "year": "2019" }, "NIPS2011_5dd9db5e": { "author": "Ordonez, Vicente and Kulkarni, Girish and Berg, Tamara", "booktitle": "Advances in Neural Information Processing Systems", "editor": "J. Shawe-Taylor and R. Zemel and P. 
Bartlett and F. Pereira and K.Q. Weinberger", "pages": "", "publisher": "Curran Associates, Inc.", "title": "Im2Text: Describing Images Using 1 Million Captioned Photographs", "type": "inproceedings", "url": "https://proceedings.neurips.cc/paper_files/paper/2011/file/5dd9db5e033da9c6fb5ba83c7a7ebea9-Paper.pdf", "volume": "24", "year": "2011" }, "NIPS2016_8a3363ab": { "author": "Salimans, Tim and Goodfellow, Ian and Zaremba, Wojciech and Cheung, Vicki and Radford, Alec and Chen, Xi and Chen, Xi", "booktitle": "Advances in Neural Information Processing Systems", "editor": "D. Lee and M. Sugiyama and U. Luxburg and I. Guyon and R. Garnett", "pages": "", "publisher": "Curran Associates, Inc.", "title": "Improved Techniques for Training GANs", "type": "inproceedings", "url": "https://proceedings.neurips.cc/paper_files/paper/2016/file/8a3363abe792db2d8761d6403605aeb7-Paper.pdf", "volume": "29", "year": "2016" }, "Otani_2023_CVPR": { "author": "Otani, Mayu and Togashi, Riku and Sawai, Yu and Ishigami, Ryosuke and Nakashima, Yuta and Rahtu, Esa and Heikkil\\\"a, Janne and Satoh, Shin{\\textquoteright}ichi", "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)", "month": "June", "pages": "14277-14286", "title": "Toward Verifiable and Reproducible Human Evaluation for Text-to-Image Generation", "type": "InProceedings", "year": "2023" }, "Rombach_2022_CVPR": { "author": "Rombach, Robin and Blattmann, Andreas and Lorenz, Dominik and Esser, Patrick and Ommer, Bj\\\"orn", "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)", "month": "June", "pages": "10684-10695", "title": "High-Resolution Image Synthesis With Latent Diffusion Models", "type": "InProceedings", "year": "2022" }, "Ruiz_2023_CVPR": { "author": "Ruiz, Nataniel and Li, Yuanzhen and Jampani, Varun and Pritch, Yael and Rubinstein, Michael and Aberman, Kfir", "booktitle": "Proceedings of the IEEE/CVF Conference on 
Computer Vision and Pattern Recognition (CVPR)", "month": "June", "pages": "22500-22510", "title": "DreamBooth: Fine Tuning Text-to-Image Diffusion Models for Subject-Driven Generation", "type": "InProceedings", "year": "2023" }, "Sahin_2024_WACV": { "author": "Sahin, Ugur and Li, Hang and Khan, Qadeer and Cremers, Daniel and Tresp, Volker", "booktitle": "Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)", "month": "January", "pages": "5563-5573", "title": "Enhancing Multimodal Compositional Reasoning of Visual Language Models With Generative Negative Mining", "type": "InProceedings", "year": "2024" }, "Salin_2023_ICCV": { "author": "Salin, Emmanuelle and Ayache, St\\'ephane and Favre, Benoit", "booktitle": "Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops", "month": "October", "pages": "339-352", "title": "Towards an Exhaustive Evaluation of Vision-Language Foundation Models", "type": "InProceedings", "year": "2023" }, "Schramowski_2023_CVPR": { "author": "Schramowski, Patrick and Brack, Manuel and Deiseroth, Bj\\\"orn and Kersting, Kristian", "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)", "month": "June", "pages": "22522-22531", "title": "Safe Latent Diffusion: Mitigating Inappropriate Degeneration in Diffusion Models", "type": "InProceedings", "year": "2023" }, "Simonyan15vgg": { "author": "\"Karen Simonyan and Andrew Zisserman\",", "booktitle": "\"International Conference on Learning Representations\",", "title": "\"Very Deep Convolutional Networks for Large-Scale Image Recognition\",", "type": "InProceedings", "year": "\"2015\"," }, "Su2020VL-BERT": { "author": "Weijie Su and Xizhou Zhu and Yue Cao and Bin Li and Lewei Lu and Furu Wei and Jifeng Dai", "booktitle": "International Conference on Learning Representations", "title": "VL-BERT: Pre-training of Generic Visual-Linguistic Representations", "type": "inproceedings", 
"url": "https://openreview.net/forum?id", "year": "2020" }, "Thrush_2022_CVPR": { "author": "Thrush, Tristan and Jiang, Ryan and Bartolo, Max and Singh, Amanpreet and Williams, Adina and Kiela, Douwe and Ross, Candace", "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)", "month": "June", "pages": "5238-5248", "title": "Winoground: Probing Vision and Language Models for Visio-Linguistic Compositionality", "type": "InProceedings", "year": "2022" }, "Tian2022GenerativeAN": { "author": "Chunwei Tian and Xuanyu Zhang and Chun-Wei Lin and Wangmeng Zuo and Yanning Zhang", "journal": "ArXiv", "title": "Generative Adversarial Networks for Image Super-Resolution: A Survey", "type": "article", "url": "https://api.semanticscholar.org/CorpusID:248426817", "volume": "abs/2204.13620", "year": "2022" }, "Tran_2020_ACCV": { "author": "Tran, Linh Duy and Nguyen, Son Minh and Arai, Masayuki", "booktitle": "Proceedings of the Asian Conference on Computer Vision (ACCV)", "month": "November", "title": "GAN-based Noise Model for Denoising Real Images", "type": "InProceedings", "year": "2020" }, "Vedantam_2015_CVPR": { "author": "Vedantam, Ramakrishna and Lawrence Zitnick, C. and Parikh, Devi", "booktitle": "Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)", "month": "June", "title": "CIDEr: Consensus-Based Image Description Evaluation", "type": "InProceedings", "year": "2015" }, "Wang_2023_CVPR": { "author": "Wang, Su and Saharia, Chitwan and Montgomery, Ceslee and Pont-Tuset, Jordi and Noy, Shai and Pellegrini, Stefano and Onoe, Yasumasa and Laszlo, Sarah and Fleet, David J. 
and Soricut, Radu and Baldridge, Jason and Norouzi, Mohammad and Anderson, Peter and Chan, William", "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)", "month": "June", "pages": "18359-18369", "title": "Imagen Editor and EditBench: Advancing and Evaluating Text-Guided Image Inpainting", "type": "InProceedings", "year": "2023" }, "Wu2023HumanPS": { "author": "Xiaoshi Wu and Yiming Hao and Keqiang Sun and Yixiong Chen and Feng Zhu and Rui Zhao and Hongsheng Li", "journal": "ArXiv", "title": "Human Preference Score v2: A Solid Benchmark for Evaluating Human Preferences of Text-to-Image Synthesis", "type": "article", "url": "https://api.semanticscholar.org/CorpusID:259171771", "volume": "abs/2306.09341", "year": "2023" }, "Wu_2023_ICCV": { "author": "Wu, Xiaoshi and Sun, Keqiang and Zhu, Feng and Zhao, Rui and Li, Hongsheng", "booktitle": "Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)", "month": "October", "pages": "2096-2105", "title": "Human Preference Score: Better Aligning Text-to-Image Models with Human Preference", "type": "InProceedings", "year": "2023" }, "Yarom2023WhatYS": { "author": "Michal Yarom and Yonatan Bitton and Soravit Changpinyo and Roee Aharoni and Jonathan Herzig and Oran Lang and Eran. O. Ofek and Idan Szpektor", "journal": "ArXiv", "title": "What You See is What You Read? 
Improving Text-Image Alignment Evaluation", "type": "article", "url": "https://api.semanticscholar.org/CorpusID:258740893", "volume": "abs/2305.10400", "year": "2023" }, "Zellers_2019_CVPR": { "author": "Zellers, Rowan and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin", "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)", "month": "June", "title": "From Recognition to Cognition: Visual Commonsense Reasoning", "type": "InProceedings", "year": "2019" }, "Zeng2024IntentTunerAI": { "author": "Xingchen Zeng and Ziyao Gao and Yilin Ye and Wei Zeng", "journal": "ArXiv", "title": "IntentTuner: An Interactive Framework for Integrating Human Intents in Fine-tuning Text-to-Image Generative Models", "type": "article", "url": "https://api.semanticscholar.org/CorpusID:267312299", "volume": "abs/2401.15559", "year": "2024" }, "Zhang2020BERTScore": { "author": "Tianyi Zhang* and Varsha Kishore* and Felix Wu* and Kilian Q. Weinberger and Yoav Artzi", "booktitle": "International Conference on Learning Representations", "title": "BERTScore: Evaluating Text Generation with BERT", "type": "inproceedings", "url": "https://openreview.net/forum?id", "year": "2020" }, "Zhang2023CompressA": { "author": "Lei Zhang and Fangxun Shu and Sucheng Ren and Bingchen Zhao and Hao Jiang and Cihang Xie", "journal": "ArXiv", "title": "Compress \\& Align: Curating Image-Text Data with Human Knowledge", "type": "article", "url": "https://api.semanticscholar.org/CorpusID:266174263", "volume": "abs/2312.06726", "year": "2023" }, "Zhang_2021_CVPR": { "author": "Zhang, Pengchuan and Li, Xiujun and Hu, Xiaowei and Yang, Jianwei and Zhang, Lei and Wang, Lijuan and Choi, Yejin and Gao, Jianfeng", "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)", "month": "June", "pages": "5579-5588", "title": "VinVL: Revisiting Visual Representations in Vision-Language Models", "type": "InProceedings", "year": "2021" }, 
"Zhou_2022_CVPR": { "author": "Zhou, Xingyi and Koltun, Vladlen and Kr\\\"ahenb\\\"uhl, Philipp", "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)", "month": "June", "pages": "7571-7580", "title": "Simple Multi-Dataset Detection", "type": "InProceedings", "year": "2022" }, "Zhou_2023_CVPR": { "author": "Zhou, Yutong and Shimada, Nobutaka", "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops", "month": "June", "pages": "826-842", "title": "Vision + Language Applications: A Survey", "type": "InProceedings", "year": "2023" }, "Zhu_2015_ICCV": { "author": "Zhu, Yukun and Kiros, Ryan and Zemel, Rich and Salakhutdinov, Ruslan and Urtasun, Raquel and Torralba, Antonio and Fidler, Sanja", "booktitle": "Proceedings of the IEEE International Conference on Computer Vision (ICCV)", "month": "December", "title": "Aligning Books and Movies: Towards Story-Like Visual Explanations by Watching Movies and Reading Books", "type": "InProceedings", "year": "2015" }, "Zitnick_2013_CVPR": { "author": "Zitnick, C. L. 
and Parikh, Devi", "booktitle": "Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)", "month": "June", "title": "Bringing Semantics into Focus Using Visual Abstraction", "type": "InProceedings", "year": "2013" }, "ahmad2022new": { "author": "Ahmad, Waqar and Ali, Hazrat and Shah, Zubair and Azmat, Shoaib", "journal": "Scientific Reports", "number": "1", "pages": "9533", "publisher": "Nature Publishing Group UK London", "title": "A new generative adversarial network for medical images super resolution", "type": "article", "volume": "12", "year": "2022" }, "bai2021mifid": { "author": "Bai, Ching-Yuan and Lin, Hsuan-Tien and Raffel, Colin and Kan, Wendy Chi-wen", "booktitle": "Proceedings of the 27th ACM SIGKDD Conference on Knowledge Discovery \\& Data Mining", "pages": "2534--2542", "title": "On training sample memorization: Lessons from benchmarking generative modeling with a large-scale competition", "type": "inproceedings", "year": "2021" }, "balanced_vqa_v2": { "author": "Yash Goyal and Tejas Khot and Douglas Summers{-}Stay and Dhruv Batra and Devi Parikh", "booktitle": "Conference on Computer Vision and Pattern Recognition (CVPR)", "title": "Making the {V} in {VQA} Matter: Elevating the Role of Image Understanding in {V}isual {Q}uestion {A}nswering", "type": "InProceedings", "year": "2017" }, "banerjee-lavie-2005-meteor": { "address": "\"Ann Arbor, Michigan\",", "author": "\"Banerjee, Satanjeev and Lavie, Alon\",", "booktitle": "\"Proceedings of the {ACL} Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and/or Summarization\",", "editor": "\"Goldstein, Jade and Lavie, Alon and Lin, Chin-Yew and Voss, Clare\",", "month": "jun,", "pages": "\"65--72\",", "publisher": "\"Association for Computational Linguistics\",", "title": "\"{METEOR}: An Automatic Metric for {MT} Evaluation with Improved Correlation with Human Judgments\",", "type": "inproceedings", "url": "\"https://aclanthology.org/W05-0909\",", 
"year": "\"2005\"," }, "barratt2018note": { "author": "Barratt, Shane and Sharma, Rishi", "journal": "arXiv preprint arXiv:1801.01973", "title": "A note on the inception score", "type": "article", "year": "2018" }, "baryshnikov2023hypernymy": { "author": "Baryshnikov, Anton and Ryabinin, Max", "journal": "arXiv preprint arXiv:2310.09247", "title": "Hypernymy Understanding Evaluation of Text-to-Image Models via WordNet Hierarchy", "type": "article", "year": "2023" }, "betti2023let": { "author": "Betti, Federico and Staiano, Jacopo and Baraldi, Lorenzo and Baraldi, Lorenzo and Cucchiara, Rita and Sebe, Nicu", "booktitle": "Proceedings of the 31st ACM International Conference on Multimedia", "pages": "9306--9312", "title": "Let's ViCE! Mimicking Human Cognitive Behavior in Image Generation Evaluation", "type": "inproceedings", "year": "2023" }, "binkowski2018kid": { "author": "Bi{\\'n}kowski, Miko{\\l}aj and Sutherland, Danica J and Arbel, Michael and Gretton, Arthur", "booktitle": "International Conference on Learning Representations", "title": "Demystifying MMD GANs", "type": "inproceedings", "year": "2018" }, "borji2022pros": { "author": "Borji, Ali", "journal": "Computer Vision and Image Understanding", "pages": "103329", "publisher": "Elsevier", "title": "Pros and cons of GAN evaluation measures: New developments", "type": "article", "volume": "215", "year": "2022" }, "brock2018large": { "author": "Brock, Andrew and Donahue, Jeff and Simonyan, Karen", "journal": "arXiv preprint arXiv:1809.11096", "title": "Large scale GAN training for high fidelity natural image synthesis", "type": "article", "year": "2018" }, "brown2020language": { "author": "Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and others", "journal": "Advances in neural information processing systems", "pages": "1877--1901", "title": "Language models are 
few-shot learners", "type": "article", "volume": "33", "year": "2020" }, "caron2021emerging": { "author": "Caron, Mathilde and Touvron, Hugo and Misra, Ishan and J{\\'e}gou, Herv{\\'e} and Mairal, Julien and Bojanowski, Piotr and Joulin, Armand", "booktitle": "Proceedings of the IEEE/CVF international conference on computer vision", "pages": "9650--9660", "title": "Emerging properties in self-supervised vision transformers", "type": "inproceedings", "year": "2021" }, "castro2024clove": { "author": "Castro, Santiago and Ziai, Amir and Saluja, Avneesh and Yuan, Zhuoning and Mihalcea, Rada", "journal": "arXiv preprint arXiv:2402.15021", "title": "CLoVe: Encoding Compositional Language in Contrastive Vision-Language Models", "type": "article", "year": "2024" }, "chambers2010flumejava": { "abstract": "MapReduce and similar systems significantly ease the task of writing data-parallel code. However, many real-world computations require a pipeline of MapReduces, and programming and managing such pipelines can be difficult. We present FlumeJava, a Java library that makes it easy to develop, test, and run efficient data-parallel pipelines. At the core of the FlumeJava library are a couple of classes that represent immutable parallel collections, each supporting a modest number of operations for processing them in parallel. Parallel collections and their operations present a simple, high-level, uniform abstraction over different data representations and execution strategies. To enable parallel operations to run efficiently, FlumeJava defers their evaluation, instead internally constructing an execution plan dataflow graph. When the final results of the parallel operations are eventually needed, FlumeJava first optimizes the execution plan, and then executes the optimized operations on appropriate underlying primitives (e.g., MapReduces). 
The combination of high-level abstractions for parallel data and computation, deferred evaluation and optimization, and efficient parallel primitives yields an easy-to-use system that approaches the efficiency of hand-optimized pipelines. FlumeJava is in active use by hundreds of pipeline developers within Google.", "address": "New York, NY, USA", "author": "Chambers, Craig and Raniwala, Ashish and Perry, Frances and Adams, Stephen and Henry, Robert R. and Bradshaw, Robert and Weizenbaum, Nathan", "booktitle": "Proceedings of the 31st ACM SIGPLAN Conference on Programming Language Design and Implementation", "doi": "10.1145/1806596.1806638", "isbn": "9781450300193", "keywords": "data-parallel programming, java, mapreduce", "location": "Toronto, Ontario, Canada", "numpages": "13", "pages": "363\u2013375", "publisher": "Association for Computing Machinery", "series": "PLDI '10", "title": "FlumeJava: easy, efficient data-parallel pipelines", "type": "inproceedings", "url": "https://doi.org/10.1145/1806596.1806638", "year": "2010" }, "changpinyo2022all": { "author": "Changpinyo, Soravit and Kukliansky, Doron and Szpektor, Idan and Chen, Xi and Ding, Nan and Soricut, Radu", "journal": "arXiv preprint arXiv:2205.01883", "title": "All you may need for vqa are image captions", "type": "article", "year": "2022" }, "chao:iccv2015": { "author": "Yu-Wei Chao and Zhan Wang and Yugeng He and Jiaxuan Wang and Jia Deng", "booktitle": "Proceedings of the IEEE International Conference on Computer Vision", "title": "HICO: A Benchmark for Recognizing Human-Object Interactions in Images", "type": "INPROCEEDINGS", "year": "2015" }, "che2016mode": { "author": "Che, Tong and Li, Yanran and Jacob, Athul and Bengio, Yoshua and Li, Wenjie", "booktitle": "International Conference on Learning Representations", "title": "Mode Regularized Generative Adversarial Networks", "type": "inproceedings", "year": "2016" }, "chefer2023attend": { "author": "Chefer, Hila and Alaluf, Yuval and Vinker, Yael 
and Wolf, Lior and Cohen-Or, Daniel", "journal": "ACM Transactions on Graphics (TOG)", "number": "4", "pages": "1--10", "publisher": "ACM New York, NY, USA", "title": "Attend-and-excite: Attention-based semantic guidance for text-to-image diffusion models", "type": "article", "volume": "42", "year": "2023" }, "chen2015microsoft": { "author": "Chen, Xinlei and Fang, Hao and Lin, Tsung-Yi and Vedantam, Ramakrishna and Gupta, Saurabh and Doll{\\'a}r, Piotr and Zitnick, C Lawrence", "journal": "arXiv preprint arXiv:1504.00325", "title": "Microsoft coco captions: Data collection and evaluation server", "type": "article", "year": "2015" }, "chen2020uniter": { "author": "Chen, Yen-Chun and Li, Linjie and Yu, Licheng and El Kholy, Ahmed and Ahmed, Faisal and Gan, Zhe and Cheng, Yu and Liu, Jingjing", "booktitle": "European conference on computer vision", "organization": "Springer", "pages": "104--120", "title": "Uniter: Universal image-text representation learning", "type": "inproceedings", "year": "2020" }, "chen2022pali": { "author": "Chen, Xi and Wang, Xiao and Changpinyo, Soravit and Piergiovanni, AJ and Padlewski, Piotr and Salz, Daniel and Goodman, Sebastian and Grycner, Adam and Mustafa, Basil and Beyer, Lucas and others", "journal": "arXiv preprint arXiv:2209.06794", "title": "Pali: A jointly-scaled multilingual language-image model", "type": "article", "year": "2022" }, "dash2017tac": { "author": "Dash, Ayushman and Gamboa, John Cristian Borges and Ahmed, Sheraz and Liwicki, Marcus and Afzal, Muhammad Zeshan", "journal": "arXiv preprint arXiv:1703.06412", "title": "Tac-gan-text conditioned auxiliary classifier generative adversarial network", "type": "article", "year": "2017" }, "dehouche2023s": { "author": "Dehouche, Nassim and Dehouche, Kullathida", "journal": "Heliyon", "number": "6", "publisher": "Elsevier", "title": "What\u2019s in a text-to-image prompt? 
The potential of stable diffusion in visual arts education", "type": "article", "volume": "9", "year": "2023" }, "devlin2018bert": { "author": "Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina", "journal": "arXiv preprint arXiv:1810.04805", "title": "Bert: Pre-training of deep bidirectional transformers for language understanding", "type": "article", "year": "2018" }, "dinh2022tise": { "author": "Dinh, Tan M and Nguyen, Rang and Hua, Binh-Son", "booktitle": "European Conference on Computer Vision", "organization": "Springer", "pages": "594--609", "title": "TISE: Bag of metrics for text-to-image synthesis evaluation", "type": "inproceedings", "year": "2022" }, "du2007ergas": { "author": "Du, Qian and Younan, Nicholas H and King, Roger and Shah, Vijay P", "journal": "IEEE Geoscience and Remote Sensing Letters", "number": "4", "pages": "518--522", "publisher": "IEEE", "title": "On the performance evaluation of pan-sharpening techniques", "type": "article", "volume": "4", "year": "2007" }, "dunlap2023describing": { "author": "Dunlap, Lisa and Zhang, Yuhui and Wang, Xiaohan and Zhong, Ruiqi and Darrell, Trevor and Steinhardt, Jacob and Gonzalez, Joseph E and Yeung-Levy, Serena", "journal": "arXiv preprint arXiv:2312.02974", "title": "Describing Differences in Image Sets with Natural Language", "type": "article", "year": "2023" }, "faghri2018vse++": { "author": "Faghri, Fartash and Fleet, David J and Kiros, Jamie Ryan and Fidler, Sanja", "booktitle": "Proceedings of the British Machine Vision Conference ({BMVC})", "title": "VSE\\+\\+: Improving Visual-Semantic Embeddings with Hard Negatives", "type": "article", "url": "https://github.com/fartashf/vsepp", "year": "2018" }, "feng2023trainingfree": { "author": "Weixi Feng and Xuehai He and Tsu-Jui Fu and Varun Jampani and Arjun Reddy Akula and Pradyumna Narayana and Sugato Basu and Xin Eric Wang and William Yang Wang", "booktitle": "The Eleventh International Conference on Learning Representations 
", "title": "Training-Free Structured Diffusion Guidance for Compositional Text-to-Image Synthesis", "type": "inproceedings", "url": "https://openreview.net/forum?id", "year": "2023" }, "fu2023dreamsim": { "author": "Fu, Stephanie and Tamir, Netanel and Sundaram, Shobhita and Chai, Lucy and Zhang, Richard and Dekel, Tali and Isola, Phillip", "journal": "arXiv preprint arXiv:2306.09344", "title": "DreamSim: Learning New Dimensions of Human Visual Similarity using Synthetic Data", "type": "article", "year": "2023" }, "gal2022clipdirsim": { "author": "Gal, Rinon and Patashnik, Or and Maron, Haggai and Bermano, Amit H and Chechik, Gal and Cohen-Or, Daniel", "journal": "ACM Transactions on Graphics (TOG)", "number": "4", "pages": "1--13", "publisher": "ACM New York, NY, USA", "title": "StyleGAN-NADA: CLIP-guided domain adaptation of image generators", "type": "article", "volume": "41", "year": "2022" }, "gan2020large": { "author": "Gan, Zhe and Chen, Yen-Chun and Li, Linjie and Zhu, Chen and Cheng, Yu and Liu, Jingjing", "journal": "Advances in Neural Information Processing Systems", "pages": "6616--6628", "title": "Large-scale adversarial training for vision-and-language representation learning", "type": "article", "volume": "33", "year": "2020" }, "gordon2023mismatch": { "archiveprefix": "arXiv", "author": "Brian Gordon and Yonatan Bitton and Yonatan Shafir and Roopal Garg and Xi Chen and Dani Lischinski and Daniel Cohen-Or and Idan Szpektor", "eprint": "2312.03766", "primaryclass": "cs.CL", "title": "Mismatch Quest: Visual and Textual Feedback for Image-Text Misalignment", "type": "misc", "year": "2023" }, "gretton2006kernel": { "author": "Gretton, Arthur and Borgwardt, Karsten and Rasch, Malte and Sch{\\\"o}lkopf, Bernhard and Smola, Alex", "journal": "Advances in neural information processing systems", "title": "A kernel method for the two-sample-problem", "type": "article", "volume": "19", "year": "2006" }, "gu2020giqa": { "author": "Gu, Shuyang and Bao, Jianmin 
and Chen, Dong and Wen, Fang", "booktitle": "Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XI 16", "organization": "Springer", "pages": "369--385", "title": "Giqa: Generated image quality assessment", "type": "inproceedings", "year": "2020" }, "gu2023automated": { "author": "Gu, Siqi", "journal": "arXiv preprint arXiv:2312.12933", "title": "Automated Testing for Text-to-Image Software", "type": "article", "year": "2023" }, "hartwig2022learning": { "abstract": "Abstract View quality measures compute scores for given views and are used to determine an optimal view in viewpoint selection tasks. Unfortunately, despite the wide adoption of these measures, they are rather based on computational quantities, such as entropy, than human preferences. To instead tailor viewpoint measures towards humans, view quality measures need to be able to capture human viewpoint preferences. Therefore, we introduce a large-scale crowdsourced data set, which contains 58k annotated viewpoints for 3220 ModelNet40 models. Based on this data, we derive a neural view quality measure abiding to human preferences. We further demonstrate that this view quality measure not only generalizes to models unseen during training, but also to unseen model categories. We are thus able to predict view qualities for single images, and directly predict human preferred viewpoints for 3D models by exploiting point-based learning technology, without requiring to generate intermediate images or sampling the view sphere. We will detail our data collection procedure, describe the data analysis and model training and will evaluate the predictive quality of our trained viewpoint measure on unseen models and categories. To our knowledge, this is the first deep learning approach to predict a view quality measure solely based on human preferences.", "author": "Hartwig, S. and Schelling, M. and Onzenoodt, C. v. and V\u00e1zquez, P.-P. and Hermosilla, P. 
and Ropinski, T.", "doi": "https://doi.org/10.1111/cgf.14613", "eprint": "https://onlinelibrary.wiley.com/doi/pdf/10.1111/cgf.14613", "journal": "Computer Graphics Forum", "keywords": "user studies, interaction, perceptually-based rendering, rendering", "number": "6", "pages": "453-466", "title": "Learning Human Viewpoint Preferences from Sparsely Annotated Models", "type": "article", "url": "https://onlinelibrary.wiley.com/doi/abs/10.1111/cgf.14613", "volume": "41", "year": "2022" }, "he2016deep": { "author": "He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian", "booktitle": "Proceedings of the IEEE conference on computer vision and pattern recognition", "pages": "770--778", "title": "Deep residual learning for image recognition", "type": "inproceedings", "year": "2016" }, "hessel2021clipscore": { "author": "Hessel, Jack and Holtzman, Ari and Forbes, Maxwell and Bras, Ronan Le and Choi, Yejin", "journal": "arXiv preprint arXiv:2104.08718", "title": "Clipscore: A reference-free evaluation metric for image captioning", "type": "article", "year": "2021" }, "hinz2020semantic": { "author": "Hinz, Tobias and Heinrich, Stefan and Wermter, Stefan", "journal": "IEEE transactions on pattern analysis and machine intelligence", "number": "3", "pages": "1552--1565", "publisher": "IEEE", "title": "Semantic object accuracy for generative text-to-image synthesis", "type": "article", "volume": "44", "year": "2020" }, "ho2022imagen": { "archiveprefix": "arXiv", "author": "Jonathan Ho and William Chan and Chitwan Saharia and Jay Whang and Ruiqi Gao and Alexey Gritsenko and Diederik P. Kingma and Ben Poole and Mohammad Norouzi and David J. 
Fleet and Tim Salimans", "eprint": "2210.02303", "primaryclass": "cs.CV", "title": "Imagen Video: High Definition Video Generation with Diffusion Models", "type": "misc", "year": "2022" }, "ho2022video": { "archiveprefix": "arXiv", "author": "Jonathan Ho and Tim Salimans and Alexey Gritsenko and William Chan and Mohammad Norouzi and David J. Fleet", "eprint": "2204.03458", "primaryclass": "cs.CV", "title": "Video Diffusion Models", "type": "misc", "year": "2022" }, "hochreiter1997lstm": { "author": "Hochreiter, Sepp and Schmidhuber, J{\\\"u}rgen", "journal": "Neural computation", "number": "8", "pages": "1735--1780", "publisher": "MIT press", "title": "Long short-term memory", "type": "article", "volume": "9", "year": "1997" }, "hodosh2013framing": { "author": "Hodosh, Micah and Young, Peter and Hockenmaier, Julia", "journal": "Journal of Artificial Intelligence Research", "pages": "853--899", "title": "Framing image description as a ranking task: Data, models and evaluation metrics", "type": "article", "volume": "47", "year": "2013" }, "honnibal2017spacy": { "author": "Honnibal, Matthew and Montani, Ines", "journal": "To appear", "number": "1", "pages": "411--420", "title": "spaCy 2: Natural language understanding with Bloom embeddings, convolutional neural networks and incremental parsing", "type": "article", "volume": "7", "year": "2017" }, "hsieh2023sugarcrepe": { "author": "Cheng-Yu Hsieh and Jieyu Zhang and Zixian Ma and Aniruddha Kembhavi and Ranjay Krishna", "booktitle": "Thirty-seventh Conference on Neural Information Processing Systems Datasets and Benchmarks Track", "title": "SugarCrepe: Fixing Hackable Benchmarks for Vision-Language Compositionality", "type": "inproceedings", "url": "https://openreview.net/forum?id", "year": "2023" }, "huang2023t2icompbench": { "author": "Kaiyi Huang and Kaiyue Sun and Enze Xie and Zhenguo Li and Xihui Liu", "journal": "arXiv preprint arXiv: 2307.06350", "title": "T2I-CompBench: A Comprehensive Benchmark for Open-world 
Compositional Text-to-image Generation", "type": "article", "year": "2023" }, "ijcai2022p759": { "author": "Cao, Min and Li, Shiping and Li, Juntao and Nie, Liqiang and Zhang, Min", "booktitle": "Proceedings of the Thirty-First International Joint Conference on Artificial Intelligence, {IJCAI-22}", "doi": "10.24963/ijcai.2022/759", "editor": "Lud De Raedt", "month": "7", "note": "Survey Track", "pages": "5410--5417", "publisher": "International Joint Conferences on Artificial Intelligence Organization", "title": "Image-text Retrieval: A Survey on Recent Research and Development", "type": "inproceedings", "url": "https://doi.org/10.24963/ijcai.2022/759", "year": "2022" }, "jenkins2019unsupervised": { "author": "Jenkins, Porter and Farag, Ahmad and Wang, Suhang and Li, Zhenhui", "booktitle": "Proceedings of the 28th ACM international conference on information and knowledge management", "pages": "1993--2002", "title": "Unsupervised representation learning of spatial data via multimodal embedding", "type": "inproceedings", "year": "2019" }, "jiang2019tiger": { "author": "Jiang, Ming and Huang, Qiuyuan and Zhang, Lei and Wang, Xin and Zhang, Pengchuan and Gan, Zhe and Diesner, Jana and Gao, Jianfeng", "journal": "arXiv preprint arXiv:1909.02050", "title": "Tiger: Text-to-image grounding for image caption evaluation", "type": "article", "year": "2019" }, "kamath-etal-2023-text": { "address": "\"Singapore\",", "author": "\"Kamath, Amita and Hessel, Jack and Chang, Kai-Wei\",", "booktitle": "\"Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing\",", "doi": "\"10.18653/v1/2023.emnlp-main.301\",", "editor": "\"Bouamor, Houda and Pino, Juan and Bali, Kalika\",", "month": "dec,", "pages": "\"4933--4944\"", "publisher": "\"Association for Computational Linguistics\",", "title": "\"Text encoders bottleneck compositionality in contrastive vision-language models\",", "type": "inproceedings", "url": 
"\"https://aclanthology.org/2023.emnlp-main.301\",", "year": "\"2023\"," }, "kamath2023s": { "author": "Kamath, Amita and Hessel, Jack and Chang, Kai-Wei", "journal": "arXiv preprint arXiv:2310.19785", "title": "What's\" up\" with vision-language models? Investigating their struggle with spatial reasoning", "type": "article", "year": "2023" }, "kane-etal-2020-nubia": { "abstract": "\"We present NUBIA, a methodology to build automatic evaluation metrics for text generation using only machine learning models as core components. A typical NUBIA model is composed of three modules: a neural feature extractor, an aggregator and a calibrator. We demonstrate an implementation of NUBIA showing competitive performance with stateof-the art metrics used to evaluate machine translation and state-of-the art results for image captions quality evaluation. In addition to strong performance, NUBIA models have the advantage of being modular and improve in synergy with advances in text generation models.\",", "address": "\"Online (Dublin, Ireland)\",", "author": "\"Kane, Hassan and Kocyigit, Muhammed Yusuf and Abdalla, Ali and Ajanoh, Pelkins and Coulibali, Mohamed\",", "booktitle": "\"Proceedings of the 1st Workshop on Evaluating NLG Evaluation\",", "editor": "\"Agarwal, Shubham and Du{\\v{s}}ek, Ond{\\v{r}}ej and Gehrmann, Sebastian and Gkatzia, Dimitra and Konstas, Ioannis and Van Miltenburg, Emiel and Santhanam, Sashank\",", "month": "dec,", "pages": "\"28--37\",", "publisher": "\"Association for Computational Linguistics\",", "title": "\"{NUBIA}: {N}e{U}ral Based Interchangeability Assessor for Text Generation\",", "type": "inproceedings", "url": "\"https://aclanthology.org/2020.evalnlgeval-1.4\",", "year": "\"2020\"," }, "kang2023scaling": { "author": "Kang, Minguk and Zhu, Jun-Yan and Zhang, Richard and Park, Jaesik and Shechtman, Eli and Paris, Sylvain and Park, Taesung", "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition", "pages": 
"10124--10134", "title": "Scaling up gans for text-to-image synthesis", "type": "inproceedings", "year": "2023" }, "karras2019style": { "author": "Karras, Tero and Laine, Samuli and Aila, Timo", "booktitle": "Proceedings of the IEEE/CVF conference on computer vision and pattern recognition", "pages": "4401--4410", "title": "A style-based generator architecture for generative adversarial networks", "type": "inproceedings", "year": "2019" }, "khashabi2020unifiedqa": { "author": "Khashabi, Daniel and Min, Sewon and Khot, Tushar and Sabharwal, Ashish and Tafjord, Oyvind and Clark, Peter and Hajishirzi, Hannaneh", "journal": "arXiv preprint arXiv:2005.00700", "title": "Unifiedqa: Crossing format boundaries with a single qa system", "type": "article", "year": "2020" }, "kim2021vilt": { "author": "Kim, Wonjae and Son, Bokyung and Kim, Ildoo", "booktitle": "International Conference on Machine Learning", "organization": "PMLR", "pages": "5583--5594", "title": "Vilt: Vision-and-language transformer without convolution or region supervision", "type": "inproceedings", "year": "2021" }, "kim2022mutual": { "author": "Kim, Jin-Hwa and Kim, Yunji and Lee, Jiyoung and Yoo, Kang Min and Lee, Sang-Woo", "journal": "Advances in Neural Information Processing Systems", "pages": "35072--35086", "title": "Mutual information divergence: A unified metric for multimodal generative models", "type": "article", "volume": "35", "year": "2022" }, "kim2023imagine": { "author": "Yeongbin Kim and Gautam Singh and Junyeong Park and Caglar Gulcehre and Sungjin Ahn", "booktitle": "Thirty-seventh Conference on Neural Information Processing Systems Datasets and Benchmarks Track", "title": "Imagine the Unseen World: A Benchmark for Systematic Generalization in Visual World Models", "type": "inproceedings", "url": "https://openreview.net/forum?id", "year": "2023" }, "kiros2014unifying": { "author": "Kiros, Ryan and Salakhutdinov, Ruslan and Zemel, Richard S", "journal": "arXiv preprint arXiv:1411.2539", 
"title": "Unifying visual-semantic embeddings with multimodal neural language models", "type": "article", "year": "2014" }, "kiros2018illustrative": { "author": "Kiros, Jamie and Chan, William and Hinton, Geoffrey", "booktitle": "Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)", "pages": "922--933", "title": "Illustrative language understanding: Large-scale visual grounding with image search", "type": "inproceedings", "year": "2018" }, "ku2023viescore": { "author": "Ku, Max and Jiang, Dongfu and Wei, Cong and Yue, Xiang and Chen, Wenhu", "journal": "arXiv preprint arXiv:2312.14867", "title": "VIEScore: Towards Explainable Metrics for Conditional Image Synthesis Evaluation", "type": "article", "year": "2023" }, "kumar2023comprehensive": { "author": "Kumar, Satyam and Musharaf, Dayima and Musharaf, Seerat and Sagar, Anil Kumar", "booktitle": "International Conference on Advanced Communication and Intelligent Systems", "organization": "Springer", "pages": "90--103", "title": "A Comprehensive Review of the Latest Advancements in Large Generative AI Models", "type": "inproceedings", "year": "2023" }, "kuznetsova2020open": { "author": "Kuznetsova, Alina and Rom, Hassan and Alldrin, Neil and Uijlings, Jasper and Krasin, Ivan and Pont-Tuset, Jordi and Kamali, Shahab and Popov, Stefan and Malloci, Matteo and Kolesnikov, Alexander and others", "journal": "International Journal of Computer Vision", "number": "7", "pages": "1956--1981", "publisher": "Springer", "title": "The open images dataset v4: Unified image classification, object detection, and visual relationship detection at scale", "type": "article", "volume": "128", "year": "2020" }, "kynkaanniemi2019improved": { "author": "Kynk{\\\"a}{\\\"a}nniemi, Tuomas and Karras, Tero and Laine, Samuli and Lehtinen, Jaakko and Aila, Timo", "journal": "Advances in Neural Information Processing Systems", "title": "Improved precision and recall metric for assessing 
generative models", "type": "article", "volume": "32", "year": "2019" }, "lavie2004significance": { "author": "Lavie, Alon and Sagae, Kenji and Jayaraman, Shyamsundar", "booktitle": "Machine Translation: From Real Users to Research: 6th Conference of the Association for Machine Translation in the Americas, AMTA 2004, Washington, DC, USA, September 28-October 2, 2004. Proceedings 6", "organization": "Springer", "pages": "134--143", "title": "The significance of recall in automatic metrics for MT evaluation", "type": "inproceedings", "year": "2004" }, "lee-etal-2020-vilbertscore": { "address": "\"Online\",", "author": "\"Lee, Hwanhee and Yoon, Seunghyun and Dernoncourt, Franck and Kim, Doo Soon and Bui, Trung and Jung, Kyomin\",", "booktitle": "\"Proceedings of the First Workshop on Evaluation and Comparison of NLP Systems\",", "doi": "\"10.18653/v1/2020.eval4nlp-1.4\",", "editor": "\"Eger, Steffen and Gao, Yang and Peyrard, Maxime and Zhao, Wei and Hovy, Eduard\",", "month": "nov,", "pages": "\"34--39\",", "publisher": "\"Association for Computational Linguistics\",", "title": "\"{V}i{LBERTS}core: Evaluating Image Caption Using Vision-and-Language {BERT}\",", "type": "inproceedings", "url": "\"https://aclanthology.org/2020.eval4nlp-1.4\",", "year": "\"2020\"," }, "lee2018stacked": { "author": "Lee, Kuang-Huei and Chen, Xi and Hua, Gang and Hu, Houdong and He, Xiaodong", "booktitle": "Proceedings of the European conference on computer vision (ECCV)", "pages": "201--216", "title": "Stacked cross attention for image-text matching", "type": "inproceedings", "year": "2018" }, "lee2023aligning": { "archiveprefix": "arXiv", "author": "Kimin Lee and Hao Liu and Moonkyung Ryu and Olivia Watkins and Yuqing Du and Craig Boutilier and Pieter Abbeel and Mohammad Ghavamzadeh and Shixiang Shane Gu", "eprint": "2302.12192", "primaryclass": "cs.LG", "title": "Aligning Text-to-Image Models using Human Feedback", "type": "misc", "year": "2023" }, "lee2023holistic": { "author": "Tony 
Lee and Michihiro Yasunaga and Chenlin Meng and Yifan Mai and Joon Sung Park and Agrim Gupta and Yunzhi Zhang and Deepak Narayanan and Hannah Benita Teufel and Marco Bellagente and Minguk Kang and Taesung Park and Jure Leskovec and Jun-Yan Zhu and Li Fei-Fei and Jiajun Wu and Stefano Ermon and Percy Liang", "booktitle": "Thirty-seventh Conference on Neural Information Processing Systems Datasets and Benchmarks Track", "title": "Holistic Evaluation of Text-to-Image Models", "type": "inproceedings", "url": "https://openreview.net/forum?id", "year": "2023" }, "lee2023text": { "author": "Lee, Jaewoong and Jang, Sangwon and Jo, Jaehyeong and Yoon, Jaehong and Kim, Yunji and Kim, Jin-Hwa and Ha, Jung-Woo and Hwang, Sung Ju", "journal": "arXiv preprint arXiv:2304.01515", "title": "Text-Conditioned Sampling Framework for Text-to-Image Generation with Masked Generative Models", "type": "article", "year": "2023" }, "lewis2022does": { "author": "Lewis, Martha and Nayak, Nihal V and Yu, Peilin and Yu, Qinan and Merullo, Jack and Bach, Stephen H and Pavlick, Ellie", "journal": "arXiv preprint arXiv:2212.10537", "title": "Does clip bind concepts? probing compositionality in large image models", "type": "article", "year": "2022" }, "li-etal-2022-mplug": { "abstract": "\"Large-scale pre-trained foundation models have been an emerging paradigm for building artificial intelligence (AI) systems, which can be quickly adapted to a wide range of downstream tasks. This paper presents mPLUG, a new vision-language foundation model for both cross-modal understanding and generation. Most existing pre-trained models suffer from inefficiency and linguistic signal overwhelmed by long visual sequences in cross-modal alignment. To address both problems, mPLUG introduces an effective and efficient vision-language architecture with novel cross-modal skip-connections.mPLUG is pre-trained end-to-end on large-scale image-text pairs with both discriminative and generative objectives. 
It achieves state-of-the-art results on a wide range of vision-language downstream tasks, including image captioning, image-text retrieval, visual grounding and visual question answering. mPLUG also demonstrates strong zero-shot transferability on vision-language and video-language tasks. The code and pre-trained models are available at \\url{https://github.com/alibaba/AliceMind}\",", "address": "\"Abu Dhabi, United Arab Emirates\",", "author": "\"Li, Chenliang and Xu, Haiyang and Tian, Junfeng and Wang, Wei and Yan, Ming and Bi, Bin and Ye, Jiabo and Chen, He and Xu, Guohai and Cao, Zheng and Zhang, Ji and Huang, Songfang and Huang, Fei and Zhou, Jingren and Si, Luo\",", "booktitle": "\"Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing\",", "doi": "\"10.18653/v1/2022.emnlp-main.488\",", "editor": "\"Goldberg, Yoav and Kozareva, Zornitsa and Zhang, Yue\",", "month": "dec,", "pages": "\"7241--7259\",", "publisher": "\"Association for Computational Linguistics\",", "title": "\"m{PLUG}: Effective and Efficient Vision-Language Learning by Cross-modal Skip-connections\",", "type": "inproceedings", "url": "\"https://aclanthology.org/2022.emnlp-main.488\",", "year": "\"2022\"," }, "li2020oscar": { "author": "Li, Xiujun and Yin, Xi and Li, Chunyuan and Zhang, Pengchuan and Hu, Xiaowei and Zhang, Lei and Wang, Lijuan and Hu, Houdong and Dong, Li and Wei, Furu and others", "booktitle": "Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XXX 16", "organization": "Springer", "pages": "121--137", "title": "Oscar: Object-semantics aligned pre-training for vision-language tasks", "type": "inproceedings", "year": "2020" }, "li2022blip": { "author": "Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven", "booktitle": "International Conference on Machine Learning", "organization": "PMLR", "pages": "12888--12900", "title": "Blip: Bootstrapping language-image pre-training for unified 
vision-language understanding and generation", "type": "inproceedings", "year": "2022" }, "li2023covlm": { "author": "Li, Junyan and Chen, Delin and Hong, Yining and Chen, Zhenfang and Chen, Peihao and Shen, Yikang and Gan, Chuang", "journal": "arXiv preprint arXiv:2311.03354", "title": "CoVLM: Composing Visual Entities and Relationships in Large Language Models Via Communicative Decoding", "type": "article", "year": "2023" }, "li2023divide": { "author": "Li, Yumeng and Keuper, Margret and Zhang, Dan and Khoreva, Anna", "journal": "arXiv preprint arXiv:2307.10864", "title": "Divide \\& bind your attention for improved generative semantic nursing", "type": "article", "year": "2023" }, "li2024compositional": { "author": "Junyan Li and Delin Chen and Yining Hong and Zhenfang Chen and Peihao Chen and Yikang Shen and Chuang Gan", "booktitle": "The Twelfth International Conference on Learning Representations", "title": "Compositional {VLM}: Composing Visual Entities and Relationships in Large Language Models Via Communicative Decoding", "type": "inproceedings", "url": "https://openreview.net/forum?id", "year": "2024" }, "liang2020cpgan": { "author": "Liang, Jiadong and Pei, Wenjie and Lu, Feng", "booktitle": "Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part IV 16", "organization": "Springer", "pages": "491--508", "title": "Cpgan: Content-parsing generative adversarial networks for text-to-image synthesis", "type": "inproceedings", "year": "2020" }, "liang2023rich": { "author": "Liang, Youwei and He, Junfeng and Li, Gang and Li, Peizhao and Klimovskiy, Arseniy and Carolan, Nicholas and Sun, Jiao and Pont-Tuset, Jordi and Young, Sarah and Yang, Feng and others", "journal": "arXiv preprint arXiv:2312.10240", "title": "Rich Human Feedback for Text-to-Image Generation", "type": "article", "year": "2023" }, "lin-2004-rouge": { "address": "\"Barcelona, Spain\",", "author": "\"Lin, Chin-Yew\",", "booktitle": "\"Text 
Summarization Branches Out\",", "month": "jul,", "pages": "\"74--81\",", "publisher": "\"Association for Computational Linguistics\",", "title": "\"{ROUGE}: A Package for Automatic Evaluation of Summaries\",", "type": "inproceedings", "url": "\"https://aclanthology.org/W04-1013\",", "year": "\"2004\"," }, "lin2014microsoft": { "author": "Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\\'a}r, Piotr and Zitnick, C Lawrence", "booktitle": "Computer Vision--ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13", "organization": "Springer", "pages": "740--755", "title": "Microsoft coco: Common objects in context", "type": "inproceedings", "year": "2014" }, "lin2023revisiting": { "author": "Lin, Zhiqiu and Chen, Xinyue and Pathak, Deepak and Zhang, Pengchuan and Ramanan, Deva", "journal": "arXiv preprint arXiv:2306.01879", "title": "Revisiting the Role of Language Priors in Vision-Language Models", "type": "article", "year": "2023" }, "liu2021swin": { "author": "Liu, Ze and Lin, Yutong and Cao, Yue and Hu, Han and Wei, Yixuan and Zhang, Zheng and Lin, Stephen and Guo, Baining", "booktitle": "Proceedings of the IEEE/CVF international conference on computer vision", "pages": "10012--10022", "title": "Swin transformer: Hierarchical vision transformer using shifted windows", "type": "inproceedings", "year": "2021" }, "liu2022compositional": { "author": "Liu, Nan and Li, Shuang and Du, Yilun and Torralba, Antonio and Tenenbaum, Joshua B", "booktitle": "European Conference on Computer Vision", "organization": "Springer", "pages": "423--439", "title": "Compositional visual generation with composable diffusion models", "type": "inproceedings", "year": "2022" }, "liu2024fetv": { "author": "Liu, Yuanxin and Li, Lei and Ren, Shuhuai and Gao, Rundong and Li, Shicheng and Chen, Sishuo and Sun, Xu and Hou, Lu", "journal": "Advances in Neural Information Processing 
Systems", "title": "Fetv: A benchmark for fine-grained evaluation of open-domain text-to-video generation", "type": "article", "volume": "36", "year": "2024" }, "lopez2016revisiting": { "author": "Lopez-Paz, David and Oquab, Maxime", "booktitle": "International Conference on Learning Representations", "title": "Revisiting Classifier Two-Sample Tests", "type": "inproceedings", "year": "2016" }, "lu2023llmscore": { "author": "Yujie Lu and Xianjun Yang and Xiujun Li and Xin Eric Wang and William Yang Wang", "booktitle": "Thirty-seventh Conference on Neural Information Processing Systems", "title": "LLMScore: Unveiling the Power of Large Language Models in Text-to-Image Synthesis Evaluation", "type": "inproceedings", "url": "https://openreview.net/forum?id", "year": "2023" }, "ma2023examination": { "author": "Ma, Teli and Li, Rong and Liang, Junwei", "journal": "arXiv preprint arXiv:2308.10509", "title": "An examination of the compositionality of large generative vision-language models", "type": "article", "year": "2023" }, "ma2024cobra": { "author": "Ma, Zheng and Wang, Changxin and Ouyang, Yawen and Zhao, Fei and Zhang, Jianbing and Huang, Shujian and Chen, Jiajun", "journal": "arXiv preprint arXiv:2402.11572", "title": "Cobra Effect in Reference-Free Image Captioning Metrics", "type": "article", "year": "2024" }, "madhyastha-etal-2019-vifidel": { "address": "\"Florence, Italy\",", "author": "\"Madhyastha, Pranava and Wang, Josiah and Specia, Lucia\",", "booktitle": "\"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics\",", "doi": "\"10.18653/v1/P19-1654\",", "editor": "\"Korhonen, Anna and Traum, David and M{\\`a}rquez, Llu{\\'\\i}s\",", "month": "jul,", "pages": "\"6539--6550\",", "publisher": "\"Association for Computational Linguistics\",", "title": "\"{VIFIDEL}: Evaluating the Visual Fidelity of Image Descriptions\",", "type": "inproceedings", "url": "\"https://aclanthology.org/P19-1654\",", "year": "\"2019\"," }, 
"mao2016training": { "abstract": "In this paper, we focus on training and evaluating effective word embeddings with both text and visual information. More specifically, we introduce a large-scale dataset with 300 million sentences describing over 40 million images crawled and downloaded from publicly available Pins (i.e. an image with sentence descriptions uploaded by users) on Pinterest [2]. This dataset is more than 200 times larger than MS COCO [22], the standard large-scale image dataset with sentence descriptions. In addition, we construct an evaluation dataset to directly assess the effectiveness of word embeddings in terms of finding semantically similar or related words and phrases. The word/phrase pairs in this evaluation dataset are collected from the click data with millions of users in an image search system, thus contain rich semantic relationships. Based on these datasets, we propose and compare several Recurrent Neural Networks (RNNs) based multimodal (text and image) models. Experiments show that our model benefits from incorporating the visual information into the word embeddings, and a weight sharing strategy is crucial for learning such multimodal embeddings. 
The project page is: http://www.stat.ucla.edu/~junhua.mao/multimodal_embedding.html.", "address": "Red Hook, NY, USA", "author": "Mao, Junhua and Xu, Jiajing and Jing, Yushi and Yuille, Alan", "booktitle": "Proceedings of the 30th International Conference on Neural Information Processing Systems", "isbn": "9781510838819", "location": "Barcelona, Spain", "numpages": "9", "pages": "442\u2013450", "publisher": "Curran Associates Inc.", "series": "NIPS'16", "title": "Training and evaluating multimodal word embeddings with large-scale web annotated images", "type": "inproceedings", "year": "2016" }, "marcus2022preliminary": { "archiveprefix": "arXiv", "author": "Gary Marcus and Ernest Davis and Scott Aaronson", "eprint": "2204.13807", "primaryclass": "cs.CV", "title": "A very preliminary analysis of DALL-E 2", "type": "misc", "year": "2022" }, "menendez1997jensen": { "abstract": "In this paper we investigate the Jensen-Shannon parametric divergence for testing goodness-of-fit for point estimation. Most of the work presented is an analytical study of the asymptotic differences between different members of the family proposed in goodness of fit, together with an examination of closer approximations to the exact distribution of these statistics than the commonly used chi-squared distribution. Finally the minimum Jensen-Shannon divergence estimates are introduced and compared with other well-known estimators by computer simulation.", "author": "M.L. Men\u00e9ndez and J.A. Pardo and L. Pardo and M.C. 
Pardo", "doi": "https://doi.org/10.1016/S0016-0032(96)00063-4", "issn": "0016-0032", "journal": "Journal of the Franklin Institute", "number": "2", "pages": "307-318", "title": "The Jensen-Shannon divergence", "type": "article", "url": "https://www.sciencedirect.com/science/article/pii/S0016003296000634", "volume": "334", "year": "1997" }, "miller1995Wordnet": { "abstract": "Because meaningful sentences are composed of meaningful words, any system that hopes to process natural languages as people do must have information about words and their meanings. This information is traditionally provided through dictionaries, and machine-readable dictionaries are now widely available. But dictionary entries evolved for the convenience of human readers, not for machines. WordNet1 provides a more effective combination of traditional lexicographic information and modern computing. WordNet is an online lexical database designed for use under program control. English nouns, verbs, adjectives, and adverbs are organized into sets of synonyms, each representing a lexicalized concept. Semantic relations link the synonym sets [4].", "address": "New York, NY, USA", "author": "Miller, George A.", "doi": "10.1145/219717.219748", "issn": "0001-0782", "issue_date": "Nov. 1995", "journal": "Commun. ACM", "month": "nov", "number": "11", "numpages": "3", "pages": "39\u201341", "publisher": "Association for Computing Machinery", "title": "WordNet: a lexical database for English", "type": "article", "url": "https://doi.org/10.1145/219717.219748", "volume": "38", "year": "1995" }, "minderer2022simple": { "author": "Minderer, M and Gritsenko, A and Stone, A and Neumann, M and Weissenborn, D and Dosovitskiy, A and Mahendran, A and Arnab, A and Dehghani, M and Shen, Z and others", "journal": "arXiv preprint arXiv:2205.06230", "title": "Simple open-vocabulary object detection with vision transformers. 
arxiv 2022", "type": "article", "volume": "2", "year": "2022" }, "moorthy2011blind": { "author": "Moorthy, Anush Krishna and Bovik, Alan Conrad", "journal": "IEEE transactions on Image Processing", "number": "12", "pages": "3350--3364", "publisher": "IEEE", "title": "Blind image quality assessment: From natural scene statistics to perceptual quality", "type": "article", "volume": "20", "year": "2011" }, "openai2023gpt": { "author": "OpenAI, R", "journal": "View in Article", "pages": "13", "title": "Gpt-4 technical report. arxiv 2303.08774", "type": "article", "volume": "2", "year": "2023" }, "park2021benchmark": { "author": "Dong Huk Park and Samaneh Azadi and Xihui Liu and Trevor Darrell and Anna Rohrbach", "booktitle": "Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 1)", "title": "Benchmark for Compositional Text-to-Image Synthesis", "type": "inproceedings", "url": "https://openreview.net/forum?id", "year": "2021" }, "pascal-voc-2008": { "author": "\"Everingham, M. and Van~Gool, L. and Williams, C. K. I. and Winn, J. and Zisserman, A.\",", "howpublished": "\"http://www.pascal-network.org/challenges/VOC/voc2008/workshop/index.html\"", "title": "\"The {PASCAL} {V}isual {O}bject {C}lasses {C}hallenge 2008 {(VOC2008)} {R}esults\",", "type": "misc" }, "plummer2015flickr30kentities": { "author": "Plummer, Bryan A. and Wang, Liwei and Cervantes, Chris M. and Caicedo, Juan C. 
and Hockenmaier, Julia and Lazebnik, Svetlana", "booktitle": "2015 IEEE International Conference on Computer Vision (ICCV)", "doi": "10.1109/ICCV.2015.303", "keywords": "Standards;Benchmark testing;Image resolution;Grounding;Glass;Training;Image color analysis", "number": "", "pages": "2641-2649", "title": "Flickr30k Entities: Collecting Region-to-Phrase Correspondences for Richer Image-to-Sentence Models", "type": "INPROCEEDINGS", "volume": "", "year": "2015" }, "pmlr-v139-cho21a": { "abstract": "Existing methods for vision-and-language learning typically require designing task-specific architectures and objectives for each task. For example, a multi-label answer classifier for visual question answering, a region scorer for referring expression comprehension, and a language decoder for image captioning, etc. To alleviate these hassles, in this work, we propose a unified framework that learns different tasks in a single architecture with the same language modeling objective, i.e., multimodal conditional text generation, where our models learn to generate labels in text based on the visual and textual inputs. On 7 popular vision-and-language benchmarks, including visual question answering, referring expression comprehension, visual commonsense reasoning, most of which have been previously modeled as discriminative tasks, our generative approach (with a single unified architecture) reaches comparable performance to recent task-specific state-of-the-art vision-and-language models. Moreover, our generative approach shows better generalization ability on questions that have rare answers. Also, we show that our framework allows multi-task learning in a single architecture with a single set of parameters, achieving similar performance to separately optimized single-task models. 
Our code is publicly available at: https://github.com/j-min/VL-T5", "author": "Cho, Jaemin and Lei, Jie and Tan, Hao and Bansal, Mohit", "booktitle": "Proceedings of the 38th International Conference on Machine Learning", "editor": "Meila, Marina and Zhang, Tong", "month": "18--24 Jul", "pages": "1931--1942", "pdf": "http://proceedings.mlr.press/v139/cho21a/cho21a.pdf", "publisher": "PMLR", "series": "Proceedings of Machine Learning Research", "title": "Unifying Vision-and-Language Tasks via Text Generation", "type": "InProceedings", "url": "https://proceedings.mlr.press/v139/cho21a.html", "volume": "139", "year": "2021" }, "pmlr-v139-kim21k": { "abstract": "Vision-and-Language Pre-training (VLP) has improved performance on various joint vision-and-language downstream tasks. Current approaches to VLP heavily rely on image feature extraction processes, most of which involve region supervision (e.g., object detection) and the convolutional architecture (e.g., ResNet). Although disregarded in the literature, we find it problematic in terms of both (1) efficiency/speed, that simply extracting input features requires much more computation than the multimodal interaction steps; and (2) expressive power, as it is upper bounded to the expressive power of the visual embedder and its predefined visual vocabulary. In this paper, we present a minimal VLP model, Vision-and-Language Transformer (ViLT), monolithic in the sense that the processing of visual inputs is drastically simplified to just the same convolution-free manner that we process textual inputs. We show that ViLT is up to tens of times faster than previous VLP models, yet with competitive or better downstream task performance. 
Our code and pre-trained weights are available at https://github.com/dandelin/vilt.", "author": "Kim, Wonjae and Son, Bokyung and Kim, Ildoo", "booktitle": "Proceedings of the 38th International Conference on Machine Learning", "editor": "Meila, Marina and Zhang, Tong", "month": "18--24 Jul", "pages": "5583--5594", "pdf": "http://proceedings.mlr.press/v139/kim21k/kim21k.pdf", "publisher": "PMLR", "series": "Proceedings of Machine Learning Research", "title": "ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision", "type": "InProceedings", "url": "http://proceedings.mlr.press/v139/kim21k.html", "volume": "139", "year": "2021" }, "pmlr-v37-kusnerb15": { "abstract": "We present the Word Mover\u2019s Distance (WMD), a novel distance function between text documents. Our work is based on recent results in word embeddings that learn semantically meaningful representations for words from local co-occurrences in sentences. The WMD distance measures the dissimilarity between two text documents as the minimum amount of distance that the embedded words of one document need to \"travel\" to reach the embedded words of another document. We show that this distance metric can be cast as an instance of the Earth Mover\u2019s Distance, a well studied transportation problem for which several highly efficient solvers have been developed. Our metric has no hyperparameters and is straight-forward to implement. 
Further, we demonstrate on eight real world document classification data sets, in comparison with seven state-of-the-art baselines, that the WMD metric leads to unprecedented low k-nearest neighbor document classification error rates.", "address": "Lille, France", "author": "Kusner, Matt and Sun, Yu and Kolkin, Nicholas and Weinberger, Kilian", "booktitle": "Proceedings of the 32nd International Conference on Machine Learning", "editor": "Bach, Francis and Blei, David", "month": "07--09 Jul", "pages": "957--966", "pdf": "http://proceedings.mlr.press/v37/kusnerb15.pdf", "publisher": "PMLR", "series": "Proceedings of Machine Learning Research", "title": "From Word Embeddings To Document Distances", "type": "InProceedings", "url": "https://proceedings.mlr.press/v37/kusnerb15.html", "volume": "37", "year": "2015" }, "po2023state": { "archiveprefix": "arXiv", "author": "Ryan Po and Wang Yifan and Vladislav Golyanik and Kfir Aberman and Jonathan T. Barron and Amit H. Bermano and Eric Ryan Chan and Tali Dekel and Aleksander Holynski and Angjoo Kanazawa and C. 
Karen Liu and Lingjie Liu and Ben Mildenhall and Matthias Nie\u00dfner and Bj\u00f6rn Ommer and Christian Theobalt and Peter Wonka and Gordon Wetzstein", "eprint": "2310.07204", "primaryclass": "cs.AI", "title": "State of the Art on Diffusion Models for Visual Computing", "type": "misc", "year": "2023" }, "prabhudesai2023aligning": { "archiveprefix": "arXiv", "author": "Mihir Prabhudesai and Anirudh Goyal and Deepak Pathak and Katerina Fragkiadaki", "eprint": "2310.03739", "primaryclass": "cs.CV", "title": "Aligning Text-to-Image Diffusion Models with Reward Backpropagation", "type": "misc", "year": "2023" }, "radford2019language": { "author": "Radford, Alec and Wu, Jeffrey and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya and others", "journal": "OpenAI blog", "number": "8", "pages": "9", "title": "Language models are unsupervised multitask learners", "type": "article", "volume": "1", "year": "2019" }, "radford2021learning": { "author": "Radford, Alec and Kim, Jong Wook and Hallacy, Chris and Ramesh, Aditya and Goh, Gabriel and Agarwal, Sandhini and Sastry, Girish and Askell, Amanda and Mishkin, Pamela and Clark, Jack and others", "booktitle": "International conference on machine learning", "organization": "PMLR", "pages": "8748--8763", "title": "Learning transferable visual models from natural language supervision", "type": "inproceedings", "year": "2021" }, "ramesh2021zero": { "author": "Ramesh, Aditya and Pavlov, Mikhail and Goh, Gabriel and Gray, Scott and Voss, Chelsea and Radford, Alec and Chen, Mark and Sutskever, Ilya", "booktitle": "International conference on machine learning", "organization": "PMLR", "pages": "8821--8831", "title": "Zero-shot text-to-image generation", "type": "inproceedings", "year": "2021" }, "rashtchian2010collecting": { "author": "Rashtchian, Cyrus and Young, Peter and Hodosh, Micah and Hockenmaier, Julia", "booktitle": "Proceedings of the NAACL HLT 2010 workshop on creating speech and language data with 
Amazon\u2019s Mechanical Turk", "pages": "139--147", "title": "Collecting image annotations using amazon\u2019s mechanical turk", "type": "inproceedings", "year": "2010" }, "ravuri2019classification": { "author": "Ravuri, Suman and Vinyals, Oriol", "journal": "Advances in neural information processing systems", "title": "Classification accuracy score for conditional generative models", "type": "article", "volume": "32", "year": "2019" }, "ray2024cola": { "author": "Ray, Arijit and Radenovic, Filip and Dubey, Abhimanyu and Plummer, Bryan and Krishna, Ranjay and Saenko, Kate", "journal": "Advances in Neural Information Processing Systems", "title": "cola: A Benchmark for Compositional Text-to-image Retrieval", "type": "article", "volume": "36", "year": "2024" }, "reed2016generative": { "author": "Reed, Scott and Akata, Zeynep and Yan, Xinchen and Logeswaran, Lajanugen and Schiele, Bernt and Lee, Honglak", "booktitle": "International conference on machine learning", "organization": "PMLR", "pages": "1060--1069", "title": "Generative adversarial text to image synthesis", "type": "inproceedings", "year": "2016" }, "reed2016learning": { "author": "Reed, Scott E and Akata, Zeynep and Mohan, Santosh and Tenka, Samuel and Schiele, Bernt and Lee, Honglak", "journal": "Advances in neural information processing systems", "title": "Learning what and where to draw", "type": "article", "volume": "29", "year": "2016" }, "reis2023real": { "author": "Reis, Dillon and Kupec, Jordan and Hong, Jacqueline and Daoudi, Ahmad", "journal": "arXiv preprint arXiv:2305.09972", "title": "Real-time flying object detection with YOLOv8", "type": "article", "year": "2023" }, "roberts2022scaling": { "archiveprefix": "arXiv", "author": "Adam Roberts and Hyung Won Chung and Anselm Levskaya and Gaurav Mishra and James Bradbury and Daniel Andor and Sharan Narang and Brian Lester and Colin Gaffney and Afroz Mohiuddin and Curtis Hawthorne and Aitor Lewkowycz and Alex Salcianu and Marc van Zee and Jacob 
Austin and Sebastian Goodman and Livio Baldini Soares and Haitang Hu and Sasha Tsvyashchenko and Aakanksha Chowdhery and Jasmijn Bastings and Jannis Bulian and Xavier Garcia and Jianmo Ni and Andrew Chen and Kathleen Kenealy and Jonathan H. Clark and Stephan Lee and Dan Garrette and James Lee-Thorp and Colin Raffel and Noam Shazeer and Marvin Ritter and Maarten Bosma and Alexandre Passos and Jeremy Maitin-Shepard and Noah Fiedel and Mark Omernick and Brennan Saeta and Ryan Sepassi and Alexander Spiridonov and Joshua Newlan and Andrea Gesmundo", "eprint": "2203.17189", "primaryclass": "cs.LG", "title": "Scaling Up Models and Data with $\\texttt{t5x}$ and $\\texttt{seqio}$", "type": "misc", "year": "2022" }, "rohrbach2016movie": { "archiveprefix": "arXiv", "author": "Anna Rohrbach and Atousa Torabi and Marcus Rohrbach and Niket Tandon and Christopher Pal and Hugo Larochelle and Aaron Courville and Bernt Schiele", "eprint": "1605.03705", "primaryclass": "cs.CV", "title": "Movie Description", "type": "misc", "year": "2016" }, "saharia2022photorealistic": { "author": "Saharia, Chitwan and Chan, William and Saxena, Saurabh and Li, Lala and Whang, Jay and Denton, Emily L and Ghasemipour, Kamyar and Gontijo Lopes, Raphael and Karagol Ayan, Burcu and Salimans, Tim and others", "journal": "Advances in Neural Information Processing Systems", "pages": "36479--36494", "title": "Photorealistic text-to-image diffusion models with deep language understanding", "type": "article", "volume": "35", "year": "2022" }, "sajjadi2018assessing": { "author": "Sajjadi, Mehdi SM and Bachem, Olivier and Lucic, Mario and Bousquet, Olivier and Gelly, Sylvain", "journal": "Advances in neural information processing systems", "title": "Assessing generative models via precision and recall", "type": "article", "volume": "31", "year": "2018" }, "schuhmann2022laion": { "author": "Schuhmann, Christoph and Beaumont, Romain and Vencu, Richard and Gordon, Cade and Wightman, Ross and Cherti, Mehdi and 
Coombes, Theo and Katta, Aarush and Mullis, Clayton and Wortsman, Mitchell and others", "journal": "Advances in Neural Information Processing Systems", "pages": "25278--25294", "title": "Laion-5b: An open large-scale dataset for training next generation image-text models", "type": "article", "volume": "35", "year": "2022" }, "sharma2018conceptual": { "author": "Sharma, Piyush and Ding, Nan and Goodman, Sebastian and Soricut, Radu", "booktitle": "Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)", "pages": "2556--2565", "title": "Conceptual captions: A cleaned, hypernymed, image alt-text dataset for automatic image captioning", "type": "inproceedings", "year": "2018" }, "singh2023coarse": { "author": "Singh, Harman and Zhang, Pengchuan and Wang, Qifan and Wang, Mengjiao and Xiong, Wenhan and Du, Jingfei and Chen, Yu", "journal": "arXiv preprint arXiv:2305.13812", "title": "Coarse-to-Fine Contrastive Learning in Image-Text-Graph Space for Improved Vision-Language Compositionality", "type": "article", "year": "2023" }, "singh2023divide": { "author": "Singh, Jaskirat and Zheng, Liang", "journal": "arXiv preprint arXiv:2307.04749", "title": "Divide, Evaluate, and Refine: Evaluating and Improving Text-to-Image Alignment with Iterative VQA Feedback", "type": "article", "year": "2023" }, "suhr2017corpus": { "author": "Suhr, Alane and Lewis, Mike and Yeh, James and Artzi, Yoav", "booktitle": "Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)", "pages": "217--223", "title": "A corpus of natural language for visual reasoning", "type": "inproceedings", "year": "2017" }, "szegedy2015going": { "author": "Szegedy, Christian and Liu, Wei and Jia, Yangqing and Sermanet, Pierre and Reed, Scott and Anguelov, Dragomir and Erhan, Dumitru and Vanhoucke, Vincent and Rabinovich, Andrew", "booktitle": "Proceedings of the IEEE conference on computer vision and pattern 
recognition", "pages": "1--9", "title": "Going deeper with convolutions", "type": "inproceedings", "year": "2015" }, "szegedy2016inception": { "author": "Szegedy, Christian and Vanhoucke, Vincent and Ioffe, Sergey and Shlens, Jon and Wojna, Zbigniew", "booktitle": "Proceedings of the IEEE conference on computer vision and pattern recognition", "pages": "2818--2826", "title": "Rethinking the inception architecture for computer vision", "type": "inproceedings", "year": "2016" }, "tan2019lxmert": { "author": "Tan, Hao and Bansal, Mohit", "journal": "arXiv preprint arXiv:1908.07490", "title": "Lxmert: Learning cross-modality encoder representations from transformers", "type": "article", "year": "2019" }, "wang-etal-2023-diffusiondb": { "abstract": "\"With recent advancements in diffusion models, users can generate high-quality images by writing text prompts in natural language. However, generating images with desired details requires proper prompts, and it is often unclear how a model reacts to different prompts or what the best prompts are. To help researchers tackle these critical challenges, we introduce DiffusionDB, the first large-scale text-to-image prompt dataset totaling 6.5TB, containing 14 million images generated by Stable Diffusion, 1.8 million unique prompts, and hyperparameters specified by real users. We analyze the syntactic and semantic characteristics of prompts. We pinpoint specific hyperparameter values and prompt styles that can lead to model errors and present evidence of potentially harmful model usage, such as the generation of misinformation. The unprecedented scale and diversity of this human-actuated dataset provide exciting research opportunities in understanding the interplay between prompts and generative models, detecting deepfakes, and designing human-AI interaction tools to help users more easily use these models. 
DiffusionDB is publicly available at: \\url{https://poloclub.github.io/diffusiondb}.\",", "address": "\"Toronto, Canada\",", "author": "\"Wang, Zijie J. and Montoya, Evan and Munechika, David and Yang, Haoyang and Hoover, Benjamin and Chau, Duen Horng\",", "booktitle": "\"Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)\",", "doi": "\"10.18653/v1/2023.acl-long.51\",", "editor": "\"Rogers, Anna and Boyd-Graber, Jordan and Okazaki, Naoaki\",", "month": "jul,", "pages": "\"893--911\",", "publisher": "\"Association for Computational Linguistics\",", "title": "\"{D}iffusion{DB}: A Large-scale Prompt Gallery Dataset for Text-to-Image Generative Models\",", "type": "inproceedings", "url": "\"https://aclanthology.org/2023.acl-long.51\",", "year": "\"2023\"," }, "wang-gaizauskas-2015-generating": { "address": "\"Brighton, UK\",", "author": "\"Wang, Josiah and Gaizauskas, Robert\",", "booktitle": "\"Proceedings of the 15th {E}uropean Workshop on Natural Language Generation ({ENLG})\",", "doi": "\"10.18653/v1/W15-4722\",", "editor": "\"Belz, Anya and Gatt, Albert and Portet, Fran{\\c{c}}ois and Purver, Matthew\",", "month": "sep,", "pages": "\"117--126\",", "publisher": "\"Association for Computational Linguistics\",", "title": "\"Generating Image Descriptions with Gold Standard Visual Inputs: Motivation, Evaluation and Baselines\",", "type": "inproceedings", "url": "\"https://aclanthology.org/W15-4722\",", "year": "\"2015\"," }, "wang2003multiscale": { "author": "Wang, Zhou and Simoncelli, Eero P and Bovik, Alan C", "booktitle": "The Thirty-Seventh Asilomar Conference on Signals, Systems \\& Computers, 2003", "organization": "IEEE", "pages": "1398--1402", "title": "Multiscale structural similarity for image quality assessment", "type": "inproceedings", "volume": "2", "year": "2003" }, "wang2004image": { "author": "Wang, Zhou and Bovik, Alan C and Sheikh, Hamid R and Simoncelli, Eero P", "journal": "IEEE 
transactions on image processing", "number": "4", "pages": "600--612", "publisher": "IEEE", "title": "Image quality assessment: from error visibility to structural similarity", "type": "article", "volume": "13", "year": "2004" }, "wang2023clipiqa": { "author": "Wang, Jianyi and Chan, Kelvin CK and Loy, Chen Change", "booktitle": "Proceedings of the AAAI Conference on Artificial Intelligence", "number": "2", "pages": "2555--2563", "title": "Exploring clip for assessing the look and feel of images", "type": "inproceedings", "volume": "37", "year": "2023" }, "wolff2023the": { "author": "Max Wolff and Wieland Brendel and Stuart Wolff", "booktitle": "ICLR 2023 Workshop on Mathematical and Empirical Understanding of Foundation Models", "title": "The Independent Compositional Subspace Hypothesis for the Structure of {CLIP}'s Last Layer", "type": "inproceedings", "url": "https://openreview.net/forum?id", "year": "2023" }, "wu2022grit": { "author": "Wu, Jialian and Wang, Jianfeng and Yang, Zhengyuan and Gan, Zhe and Liu, Zicheng and Yuan, Junsong and Wang, Lijuan", "journal": "arXiv preprint arXiv:2212.00280", "title": "Grit: A generative region-to-text transformer for object understanding", "type": "article", "year": "2022" }, "xie2019visual": { "archiveprefix": "arXiv", "author": "Ning Xie and Farley Lai and Derek Doran and Asim Kadav", "eprint": "1811.10582", "primaryclass": "cs.CV", "title": "Visual Entailment Task for Visually-Grounded Language Learning", "type": "misc", "year": "2019" }, "xu-etal-2023-metarevision": { "address": "\"Singapore\",", "author": "\"Xu, Guangyue and Kordjamshidi, Parisa and Chai, Joyce\",", "booktitle": "\"Findings of the Association for Computational Linguistics: EMNLP 2023\",", "doi": "\"10.18653/v1/2023.findings-emnlp.818\",", "editor": "\"Bouamor, Houda and Pino, Juan and Bali, Kalika\",", "month": "dec,", "pages": "\"12224--12236\"", "publisher": "\"Association for Computational Linguistics\",", "title": "\"{M}eta{R}e{V}ision: 
Meta-Learning with Retrieval for Visually Grounded Compositional Concept Acquisition\",", "type": "inproceedings", "url": "\"https://aclanthology.org/2023.findings-emnlp.818\",", "year": "\"2023\"," }, "xu2023imagereward": { "author": "Xu, Jiazheng and Liu, Xiao and Wu, Yuchen and Tong, Yuxuan and Li, Qinkai and Ding, Ming and Tang, Jie and Dong, Yuxiao", "journal": "arXiv preprint arXiv:2304.05977", "title": "Imagereward: Learning and evaluating human preferences for text-to-image generation", "type": "article", "year": "2023" }, "yang2023diffusion": { "author": "Yang, Ling and Zhang, Zhilong and Song, Yang and Hong, Shenda and Xu, Runsheng and Zhao, Yue and Zhang, Wentao and Cui, Bin and Yang, Ming-Hsuan", "journal": "ACM Computing Surveys", "number": "4", "pages": "1--39", "publisher": "ACM New York, NY, USA", "title": "Diffusion models: A comprehensive survey of methods and applications", "type": "article", "volume": "56", "year": "2023" }, "yarom2023seetrue": { "archiveprefix": "arXiv", "author": "Michal Yarom and Yonatan Bitton and Soravit Changpinyo and Roee Aharoni and Jonathan Herzig and Oran Lang and Eran Ofek and Idan Szpektor", "eprint": "2305.10400", "primaryclass": "cs.CL", "title": "What You See is What You Read? Improving Text-Image Alignment Evaluation", "type": "misc", "year": "2023" }, "young2014flickr30k": { "abstract": "\"We propose to use the visual denotations of linguistic expressions (i.e. the set of images they describe) to define novel denotational similarity metrics, which we show to be at least as beneficial as distributional similarities for two tasks that require semantic inference. To compute these denotational similarities, we construct a denotation graph, i.e. 
a subsumption hierarchy over constituents and their denotations, based on a large corpus of 30K images and 150K descriptive captions.\",", "address": "\"Cambridge, MA\",", "author": "\"Young, Peter and Lai, Alice and Hodosh, Micah and Hockenmaier, Julia\",", "doi": "\"10.1162/tacl_a_00166\",", "editor": "\"Lin, Dekang and Collins, Michael and Lee, Lillian\",", "journal": "\"Transactions of the Association for Computational Linguistics\",", "pages": "\"67--78\",", "publisher": "\"MIT Press\",", "title": "\"From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions\",", "type": "article", "url": "\"https://aclanthology.org/Q14-1006\",", "volume": "\"2\",", "year": "\"2014\"," }, "young2014image": { "author": "Young, Peter and Lai, Alice and Hodosh, Micah and Hockenmaier, Julia", "journal": "Transactions of the Association for Computational Linguistics", "pages": "67--78", "publisher": "MIT Press", "title": "From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions", "type": "article", "volume": "2", "year": "2014" }, "yuksekgonul2022and": { "author": "Yuksekgonul, Mert and Bianchi, Federico and Kalluri, Pratyusha and Jurafsky, Dan and Zou, James", "booktitle": "The Eleventh International Conference on Learning Representations", "title": "When and Why Vision-Language Models Behave like Bags-Of-Words, and What to Do About It?", "type": "inproceedings", "year": "2022" }, "zelaszczyk2024text": { "author": "{\\.Z}elaszczyk, Maciej and Ma{\\'n}dziuk, Jacek", "journal": "arXiv preprint arXiv:2401.11631", "title": "Text-to-Image Cross-Modal Generation: A Systematic Review", "type": "article", "year": "2024" }, "zhang2011fsim": { "author": "Zhang, Lin and Zhang, Lei and Mou, Xuanqin and Zhang, David", "journal": "IEEE transactions on Image Processing", "number": "8", "pages": "2378--2386", 
"publisher": "IEEE", "title": "FSIM: A feature similarity index for image quality assessment", "type": "article", "volume": "20", "year": "2011" }, "zhang2018unreasonable": { "author": "Zhang, Richard and Isola, Phillip and Efros, Alexei A and Shechtman, Eli and Wang, Oliver", "booktitle": "Proceedings of the IEEE conference on computer vision and pattern recognition", "pages": "586--595", "title": "The unreasonable effectiveness of deep features as a perceptual metric", "type": "inproceedings", "year": "2018" }, "zhang2022perceptual": { "author": "Zhang, Lingzhi and Zhou, Yuqian and Barnes, Connelly and Amirghodsi, Sohrab and Lin, Zhe and Shechtman, Eli and Shi, Jianbo", "booktitle": "European Conference on Computer Vision", "organization": "Springer", "pages": "146--164", "title": "Perceptual artifacts localization for inpainting", "type": "inproceedings", "year": "2022" }, "zhang2023perceptual": { "author": "Zhang, Lingzhi and Xu, Zhengjie and Barnes, Connelly and Zhou, Yuqian and Liu, Qing and Zhang, He and Amirghodsi, Sohrab and Lin, Zhe and Shechtman, Eli and Shi, Jianbo", "booktitle": "Proceedings of the IEEE/CVF International Conference on Computer Vision", "pages": "7579--7590", "title": "Perceptual Artifacts Localization for Image Synthesis Tasks", "type": "inproceedings", "year": "2023" }, "zhang2023spot": { "author": "Zhang, Gengyuan and Bi, Jinhe and Gu, Jindong and Tresp, Volker", "journal": "arXiv preprint arXiv:2311.12919", "title": "SPOT! 
Revisiting Video-Language Models for Event Understanding", "type": "article", "year": "2023" }, "zhang2023texttoimage": { "archiveprefix": "arXiv", "author": "Chenshuang Zhang and Chaoning Zhang and Mengchun Zhang and In So Kweon", "eprint": "2303.07909", "primaryclass": "cs.CV", "title": "Text-to-image Diffusion Models in Generative AI: A Survey", "type": "misc", "year": "2023" }, "zhao-etal-2022-explainable": { "address": "\"Abu Dhabi, UAE\",", "author": "\"Zhao, Tiancheng and Zhang, Tianqi and Zhu, Mingwei and Shen, Haozhan and Lee, Kyusong and Lu, Xiaopeng and Yin, Jianwei\",", "booktitle": "\"Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing: System Demonstrations\",", "doi": "\"10.18653/v1/2022.emnlp-demos.4\",", "editor": "\"Che, Wanxiang and Shutova, Ekaterina\",", "month": "dec,", "pages": "\"30--37\"", "publisher": "\"Association for Computational Linguistics\",", "title": "\"An Explainable Toolbox for Evaluating Pre-trained Vision-Language Models\",", "type": "inproceedings", "url": "\"https://aclanthology.org/2022.emnlp-demos.4\",", "year": "\"2022\"," }, "zhu2023contrastive": { "author": "Zhu, Xiangru and Sun, Penglei and Wang, Chengyu and Liu, Jingping and Li, Zhixu and Xiao, Yanghua and Huang, Jun", "journal": "arXiv preprint arXiv:2312.02338", "title": "A Contrastive Compositional Benchmark for Text-to-Image Synthesis: A Study with Unified Text-to-Image Fidelity Metrics", "type": "article", "year": "2023" } };