File size: 127,059 Bytes
d736789 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 
1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 
1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 
1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 |
const generatedBibEntries = {
"10.1007/978-3-319-46454-1_24": {
"abstract": "\"There is considerable interest in the task of automatically generating image captions. However, evaluation is challenging. Existing automatic evaluation metrics are primarily sensitive to n-gram overlap, which is neither necessary nor sufficient for the task of simulating human judgment. We hypothesize that semantic propositional content is an important component of human caption evaluation, and propose a new automated caption evaluation metric defined over scene graphs coined SPICE. Extensive evaluations across a range of models and datasets indicate that SPICE captures human judgments over model-generated captions better than other automatic metrics (e.g., system-level correlation of 0.88 with human judgments on the MS COCO dataset, versus 0.43 for CIDEr and 0.53 for METEOR). Furthermore, SPICE can answer questions such as which caption-generator best understands colors? and can caption-generators count?\",",
"address": "\"Cham\",",
"author": "\"Anderson, Peter and Fernando, Basura and Johnson, Mark and Gould, Stephen\",",
"booktitle": "\"Computer Vision -- ECCV 2016\",",
"editor": "\"Leibe, Bastian and Matas, Jiri and Sebe, Nicu and Welling, Max\",",
"isbn": "\"978-3-319-46454-1\"",
"pages": "\"382--398\",",
"publisher": "\"Springer International Publishing\",",
"title": "\"SPICE: Semantic Propositional Image Caption Evaluation\",",
"type": "InProceedings",
"year": "\"2016\","
},
"10.1016/j.neunet.2021.07.019": {
"address": "GBR",
"author": "Frolov, Stanislav and Hinz, Tobias and Raue, Federico and Hees, J\\\"{o}rn and Dengel, Andreas",
"doi": "10.1016/j.neunet.2021.07.019",
"issn": "0893-6080",
"issue_date": "Dec 2021",
"journal": "Neural Netw.",
"keywords": "Generative adversarial networks, Text-to-image synthesis",
"month": "dec",
"number": "C",
"numpages": "23",
"pages": "187\u2013209",
"publisher": "Elsevier Science Ltd.",
"title": "Adversarial text-to-image synthesis: A review",
"type": "article",
"url": "https://doi.org/10.1016/j.neunet.2021.07.019",
"volume": "144",
"year": "2021"
},
"10.1145/1809028.1806638": {
"abstract": "MapReduce and similar systems significantly ease the task of writing data-parallel code. However, many real-world computations require a pipeline of MapReduces, and programming and managing such pipelines can be difficult. We present FlumeJava, a Java library that makes it easy to develop, test, and run efficient data-parallel pipelines. At the core of the FlumeJava library are a couple of classes that represent immutable parallel collections, each supporting a modest number of operations for processing them in parallel. Parallel collections and their operations present a simple, high-level, uniform abstraction over different data representations and execution strategies. To enable parallel operations to run efficiently, FlumeJava defers their evaluation, instead internally constructing an execution plan dataflow graph. When the final results of the parallel operations are eventually needed, FlumeJava first optimizes the execution plan, and then executes the optimized operations on appropriate underlying primitives (e.g., MapReduces). The combination of high-level abstractions for parallel data and computation, deferred evaluation and optimization, and efficient parallel primitives yields an easy-to-use system that approaches the efficiency of hand-optimized pipelines. FlumeJava is in active use by hundreds of pipeline developers within Google.",
"address": "New York, NY, USA",
"author": "Chambers, Craig and Raniwala, Ashish and Perry, Frances and Adams, Stephen and Henry, Robert R. and Bradshaw, Robert and Weizenbaum, Nathan",
"doi": "10.1145/1809028.1806638",
"issn": "0362-1340",
"issue_date": "June 2010",
"journal": "SIGPLAN Not.",
"keywords": "data-parallel programming, java, mapreduce",
"month": "jun",
"number": "6",
"numpages": "13",
"pages": "363\u2013375",
"publisher": "Association for Computing Machinery",
"title": "FlumeJava: easy, efficient data-parallel pipelines",
"type": "article",
"url": "https://doi.org/10.1145/1809028.1806638",
"volume": "45",
"year": "2010"
},
"10.1145/3461353.3461388": {
"abstract": "In recent years, deep learning technology has made breakthroughs in computer vision. After using large-scale data training, the deep neural network represented by GAN is significantly better than previous technologies in image generation, including generating more reasonable, higher-definition, more complex, and more accurate images. With the continuous development of datasets, models, and applications, the fusion of different modal information, including fusion of natural language, semantic layouts, tags, edge maps, and other different modal information, to generate images has become a new demand and challenge. There are related reviews on image generation and multimodal deep learning. However, there has not been a review dedicated to multimodal deep-learning image generation to discuss the current status, existing problems, and challenges of this task. Therefore, this review proposes a survey on multimodal deep learning image generation. It aims to provide readers with an application scenario for multimodal deep learning image generation. Also, it provides readers with new multimodal deep learning image generation technologies, the relevant datasets, evaluation metrics used, and some results comparison. Finally, this article describes some of the challenges and future topics of multimodal deep learning image generation.",
"address": "New York, NY, USA",
"author": "Luo, Sanbi",
"booktitle": "Proceedings of the 2021 5th International Conference on Innovation in Artificial Intelligence",
"doi": "10.1145/3461353.3461388",
"isbn": "9781450388634",
"keywords": "multimodal, machine learning, image synthesis, deep learning, computer vision",
"location": "Xia men, China",
"numpages": "13",
"pages": "108\u2013120",
"publisher": "Association for Computing Machinery",
"series": "ICIAI '21",
"title": "A Survey on Multimodal Deep Learning for Image Synthesis: Applications, methods, datasets, evaluation metrics, and results comparison",
"type": "inproceedings",
"url": "https://doi.org/10.1145/3461353.3461388",
"year": "2021"
},
"10.3115/1073083.1073135": {
"abstract": "Human evaluations of machine translation are extensive but expensive. Human evaluations can take months to finish and involve human labor that can not be reused. We propose a method of automatic machine translation evaluation that is quick, inexpensive, and language-independent, that correlates highly with human evaluation, and that has little marginal cost per run. We present this method as an automated understudy to skilled human judges which substitutes for them when there is need for quick or frequent evaluations.",
"address": "USA",
"author": "Papineni, Kishore and Roukos, Salim and Ward, Todd and Zhu, Wei-Jing",
"booktitle": "Proceedings of the 40th Annual Meeting on Association for Computational Linguistics",
"doi": "10.3115/1073083.1073135",
"location": "Philadelphia, Pennsylvania",
"numpages": "8",
"pages": "311\u2013318",
"publisher": "Association for Computational Linguistics",
"series": "ACL '02",
"title": "BLEU: a method for automatic evaluation of machine translation",
"type": "inproceedings",
"url": "https://doi.org/10.3115/1073083.1073135",
"year": "2002"
},
"10.5555/3295222.3295408": {
"address": "Red Hook, NY, USA",
"author": "Heusel, Martin and Ramsauer, Hubert and Unterthiner, Thomas and Nessler, Bernhard and Hochreiter, Sepp",
"isbn": "9781510860964",
"location": "Long Beach, California, USA",
"numpages": "12",
"pages": "6629\u20136640",
"publisher": "Curran Associates Inc.",
"series": "NIPS'17",
"title": "GANs trained by a two time-scale update rule converge to a local nash equilibrium",
"type": "inproceedings",
"year": "2017"
},
"10081412": {
"author": "Croitoru, Florinel-Alin and Hondru, Vlad and Ionescu, Radu Tudor and Shah, Mubarak",
"doi": "10.1109/TPAMI.2023.3261988",
"journal": "IEEE Transactions on Pattern Analysis and Machine Intelligence",
"keywords": "Computational modeling;Mathematical models;Noise reduction;Data models;Computer vision;Training;Task analysis;Denoising diffusion models;deep generative modeling;diffusion models;image generation;noise conditioned score networks;score-based models",
"number": "9",
"pages": "10850-10869",
"title": "Diffusion Models in Vision: A Survey",
"type": "ARTICLE",
"volume": "45",
"year": "2023"
},
"10123038": {
"author": "Xu, Peng and Zhu, Xiatian and Clifton, David A.",
"doi": "10.1109/TPAMI.2023.3275156",
"journal": "IEEE Transactions on Pattern Analysis and Machine Intelligence",
"keywords": "Transformers;Task analysis;Surveys;Visualization;Taxonomy;Mathematical models;Data models;Multimodal learning;transformer;introductory;taxonomy;deep learning;machine learning",
"number": "10",
"pages": "12113-12132",
"title": "Multimodal Learning With Transformers: A Survey",
"type": "ARTICLE",
"volume": "45",
"year": "2023"
},
"10218041": {
"author": "Abdulghanni, Sara Faez and Abdulmunem, Ashwan A.",
"booktitle": "2023 Al-Sadiq International Conference on Communication and Information Technology (AICCIT)",
"doi": "10.1109/AICCIT57614.2023.10218041",
"keywords": "Measurement;Training;Surveys;Deep learning;Image synthesis;Transforms;Stability analysis;Image generation;Diffusion model;Stable diffusion;Generative models;Deep learning",
"number": "",
"pages": "171-175",
"title": "Image Generation Conditioned on Text Using Deep Learning Models: Survey",
"type": "INPROCEEDINGS",
"volume": "",
"year": "2023"
},
"4176674": {
"author": "Dolan, Brian",
"booktitle": "2006 Fortieth Asilomar Conference on Signals, Systems and Computers",
"doi": "10.1109/ACSSC.2006.354864",
"keywords": "Mammography;Computer vision;Design automation;Humans;Medical diagnostic imaging;Breast cancer;Protocols;Biopsy;Gold;Visualization",
"number": "",
"pages": "821-825",
"title": "Computer Aided Diagnosis in Mammography: Its Development and Early Challenges",
"type": "INPROCEEDINGS",
"volume": "",
"year": "2006"
},
"5412098": {
"author": "Thung, Kim-Han and Raveendran, Paramesran",
"booktitle": "2009 International Conference for Technical Postgraduates (TECHPOS)",
"doi": "10.1109/TECHPOS.2009.5412098",
"keywords": "Image quality;Digital images;Signal processing;Image coding;Image storage;Signal processing algorithms;Noise reduction;Video compression;PSNR;Gaussian noise",
"number": "",
"pages": "1-4",
"title": "A survey of image quality measures",
"type": "INPROCEEDINGS",
"volume": "",
"year": "2009"
},
"5430991": {
"author": "Saad, Michele A. and Bovik, Alan C. and Charrier, Christophe",
"doi": "10.1109/LSP.2010.2045550",
"journal": "IEEE Signal Processing Letters",
"keywords": "Discrete cosine transforms;Image quality;Statistics;Feature extraction;Layout;Distortion measurement;Support vector machines;Testing;Machine learning algorithms;Machine learning;Anisotropy;discrete cosine transform;kurtosis;natural scene statistics;no-reference quality assessment",
"number": "6",
"pages": "583-586",
"title": "A DCT Statistics-Based Blind Image Quality Index",
"type": "ARTICLE",
"volume": "17",
"year": "2010"
},
"5596999": {
"author": "Hor\u00e9, Alain and Ziou, Djemel",
"booktitle": "2010 20th International Conference on Pattern Recognition",
"doi": "10.1109/ICPR.2010.579",
"keywords": "PSNR;Degradation;Image quality;Additives;Transform coding;Sensitivity;Image coding;PSNR;SSIM;image quality metrics",
"number": "",
"pages": "2366-2369",
"title": "Image Quality Metrics: PSNR vs. SSIM",
"type": "INPROCEEDINGS",
"volume": "",
"year": "2010"
},
"5995446": {
"author": "Tang, Huixuan and Joshi, Neel and Kapoor, Ashish",
"booktitle": "CVPR 2011",
"doi": "10.1109/CVPR.2011.5995446",
"keywords": "Image quality;Distortion measurement;Transform coding;Histograms;Noise;Kernel;Degradation",
"number": "",
"pages": "305-312",
"title": "Learning a blind measure of perceptual image quality",
"type": "INPROCEEDINGS",
"volume": "",
"year": "2011"
},
"6165361": {
"author": "Ye, Peng and Doermann, David",
"doi": "10.1109/TIP.2012.2190086",
"journal": "IEEE Transactions on Image Processing",
"keywords": "Feature extraction;Training;Image quality;Visualization;Transform coding;Databases;Image coding;Gabor filter;no-reference image quality assessment (NRIQA);texture analysis;visual codebook",
"number": "7",
"pages": "3129-3138",
"title": "No-Reference Image Quality Assessment Using Visual Codebooks",
"type": "ARTICLE",
"volume": "21",
"year": "2012"
},
"6172573": {
"author": "Saad, Michele A. and Bovik, Alan C. and Charrier, Christophe",
"doi": "10.1109/TIP.2012.2191563",
"journal": "IEEE Transactions on Image Processing",
"keywords": "Discrete cosine transforms;Feature extraction;Visualization;Humans;Computational modeling;Predictive models;Image quality;Discrete cosine transform (DCT);generalized Gaussian density;natural scene statistics;no-reference image quality assessment",
"number": "8",
"pages": "3339-3352",
"title": "Blind Image Quality Assessment: A Natural Scene Statistics Approach in the DCT Domain",
"type": "ARTICLE",
"volume": "21",
"year": "2012"
},
"6190099": {
"author": "Mittal, Anish and Moorthy, Anush K. and Bovik, Alan C.",
"booktitle": "2011 Conference Record of the Forty Fifth Asilomar Conference on Signals, Systems and Computers (ASILOMAR)",
"doi": "10.1109/ACSSC.2011.6190099",
"keywords": "Humans;Image quality;Transform coding;Correlation;Databases;Measurement;Computational modeling",
"number": "",
"pages": "723-727",
"title": "Blind/Referenceless Image Spatial Quality Evaluator",
"type": "INPROCEEDINGS",
"volume": "",
"year": "2011"
},
"6353522": {
"author": "Mittal, Anish and Soundararajan, Rajiv and Bovik, Alan C.",
"doi": "10.1109/LSP.2012.2227726",
"journal": "IEEE Signal Processing Letters",
"keywords": "Image quality;Image processing;Statistical analysis;Feature extraction;Completely blind;distortion free;image quality assessment;no reference",
"number": "3",
"pages": "209-212",
"title": "Making a \u201cCompletely Blind\u201d Image Quality Analyzer",
"type": "ARTICLE",
"volume": "20",
"year": "2013"
},
"7084843": {
"author": "Venkatanath N and Praneeth D and Maruthi Chandrasekhar Bh and Channappayya, Sumohana S. and Medasani, Swarup S.",
"booktitle": "2015 Twenty First National Conference on Communications (NCC)",
"doi": "10.1109/NCC.2015.7084843",
"keywords": "Image quality;Noise;Databases;Feature extraction;Standards;Transform coding;Image segmentation;No reference image quality assessment;spatial activity;Perceptual quality",
"number": "",
"pages": "1-6",
"title": "Blind image quality evaluation using perception based features",
"type": "INPROCEEDINGS",
"volume": "",
"year": "2015"
},
"8195348": {
"author": "Wu, Xian and Xu, Kun and Hall, Peter",
"doi": "10.23919/TST.2017.8195348",
"journal": "Tsinghua Science and Technology",
"keywords": "Gallium nitride;Image generation;Generators;Image resolution;Feature extraction;Training;Linear programming;image synthesis;image editing;constrained image synthesis;generative adversarial networks;image-to-image translation",
"number": "6",
"pages": "660-674",
"title": "A survey of image synthesis and editing with generative adversarial networks",
"type": "ARTICLE",
"volume": "22",
"year": "2017"
},
"8578241": {
"address": "Los Alamitos, CA, USA",
"author": "T. Xu and P. Zhang and Q. Huang and H. Zhang and Z. Gan and X. Huang and X. He",
"booktitle": "2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
"doi": "10.1109/CVPR.2018.00143",
"issn": "",
"keywords": "gallium nitride;generative adversarial networks;computational modeling;image generation;generators;semantics;visualization",
"month": "jun",
"pages": "1316-1324",
"publisher": "IEEE Computer Society",
"title": "AttnGAN: Fine-Grained Text to Image Generation with Attentional Generative Adversarial Networks",
"type": "INPROCEEDINGS",
"url": "https://doi.ieeecomputersociety.org/10.1109/CVPR.2018.00143",
"volume": "",
"year": "2018"
},
"9495208": {
"author": "Xia, Jiazhi and Lin, Weixing and Jiang, Guang and Wang, Yunhai and Chen, Wei and Schreck, Tobias",
"doi": "10.1109/MCG.2021.3098804",
"journal": "IEEE Computer Graphics and Applications",
"keywords": "Visualization;Shape analysis;Visual perception;Clustering algorithms;Deep learning;Splines (mathematics)",
"number": "5",
"pages": "79-89",
"title": "Visual Clustering Factors in Scatterplots",
"type": "ARTICLE",
"volume": "41",
"year": "2021"
},
"AUTOMATIC1111_Stable_Diffusion_Web_2022": {
"author": "AUTOMATIC1111",
"month": "aug,",
"title": "Stable Diffusion Web UI",
"type": "software",
"url": "https://github.com/AUTOMATIC1111/stable-diffusion-webui",
"year": "2022"
},
"Agrawal_2019_ICCV": {
"author": "Agrawal, Harsh and Desai, Karan and Wang, Yufei and Chen, Xinlei and Jain, Rishabh and Johnson, Mark and Batra, Dhruv and Parikh, Devi and Lee, Stefan and Anderson, Peter",
"booktitle": "Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)",
"month": "October",
"title": "nocaps: novel object captioning at scale",
"type": "InProceedings",
"year": "2019"
},
"Antol_2015_ICCV": {
"author": "Antol, Stanislaw and Agrawal, Aishwarya and Lu, Jiasen and Mitchell, Margaret and Batra, Dhruv and Zitnick, C. Lawrence and Parikh, Devi",
"booktitle": "Proceedings of the IEEE International Conference on Computer Vision (ICCV)",
"month": "December",
"title": "VQA: Visual Question Answering",
"type": "InProceedings",
"year": "2015"
},
"BahdanauCB14": {
"author": "Dzmitry Bahdanau and Kyunghyun Cho and Yoshua Bengio",
"bibsource": "dblp computer science bibliography, https://dblp.org",
"biburl": "https://dblp.org/rec/journals/corr/BahdanauCB14.bib",
"booktitle": "3rd International Conference on Learning Representations, {ICLR} 2015, San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings",
"editor": "Yoshua Bengio and Yann LeCun",
"timestamp": "Wed, 17 Jul 2019 10:40:54 +0200",
"title": "Neural Machine Translation by Jointly Learning to Align and Translate",
"type": "inproceedings",
"url": "http://arxiv.org/abs/1409.0473",
"year": "2015"
},
"Chang2023": {
"author": "\"Nadine Chang\",",
"doi": "\"10.1184/R1/23396759.v1\"",
"month": "\"6\",",
"title": "\"{Bridging the Gap Between Human Vision and Computer Vision}\",",
"type": "article",
"url": "\"https://kilthub.cmu.edu/articles/thesis/Bridging_the_Gap_Between_Human_Vision_and_Computer_Vision/23396759\",",
"year": "\"2023\","
},
"Changpinyo_2021_CVPR": {
"author": "Changpinyo, Soravit and Sharma, Piyush and Ding, Nan and Soricut, Radu",
"booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
"month": "June",
"pages": "3558-3568",
"title": "Conceptual 12M: Pushing Web-Scale Image-Text Pre-Training To Recognize Long-Tail Visual Concepts",
"type": "InProceedings",
"year": "2021"
},
"Chen_2018_CVPR": {
"author": "Chen, Jingwen and Chen, Jiawei and Chao, Hongyang and Yang, Ming",
"booktitle": "Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)",
"month": "June",
"title": "Image Blind Denoising With Generative Adversarial Network Based Noise Modeling",
"type": "InProceedings",
"year": "2018"
},
"Chen_2023_ICCV": {
"author": "Chen, Rui and Chen, Yongwei and Jiao, Ningxin and Jia, Kui",
"booktitle": "Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)",
"month": "October",
"pages": "22246-22256",
"title": "Fantasia3D: Disentangling Geometry and Appearance for High-quality Text-to-3D Content Creation",
"type": "InProceedings",
"year": "2023"
},
"Chen_2024_WACV": {
"author": "Chen, Minghao and Laina, Iro and Vedaldi, Andrea",
"booktitle": "Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)",
"month": "January",
"pages": "5343-5353",
"title": "Training-Free Layout Control With Cross-Attention Guidance",
"type": "InProceedings",
"year": "2024"
},
"Cho_2023_ICCV": {
"author": "Cho, Jaemin and Zala, Abhay and Bansal, Mohit",
"booktitle": "Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)",
"month": "October",
"pages": "3043-3054",
"title": "DALL-Eval: Probing the Reasoning Skills and Social Biases of Text-to-Image Generation Models",
"type": "InProceedings",
"year": "2023"
},
"Cui_2018_CVPR": {
"author": "Cui, Yin and Yang, Guandao and Veit, Andreas and Huang, Xun and Belongie, Serge",
"booktitle": "Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)",
"month": "June",
"title": "Learning to Evaluate Image Captioning",
"type": "InProceedings",
"year": "2018"
},
"Desai_2021_CVPR": {
"author": "Desai, Karan and Johnson, Justin",
"booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
"month": "June",
"pages": "11162-11173",
"title": "VirTex: Learning Visual Representations From Textual Annotations",
"type": "InProceedings",
"year": "2021"
},
"Gokhale2022BenchmarkingSR": {
"author": "Tejas Gokhale and Hamid Palangi and Besmira Nushi and Vibhav Vineet and Eric Horvitz and Ece Kamar and Chitta Baral and Yezhou Yang",
"journal": "ArXiv",
"title": "Benchmarking Spatial Relationships in Text-to-Image Generation",
"type": "article",
"url": "https://api.semanticscholar.org/CorpusID:254877055",
"volume": "abs/2212.10015",
"year": "2022"
},
"Goyal_2017_CVPR": {
"author": "Goyal, Yash and Khot, Tejas and Summers-Stay, Douglas and Batra, Dhruv and Parikh, Devi",
"booktitle": "Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)",
"month": "July",
"title": "Making the v in VQA Matter: Elevating the Role of Image Understanding in Visual Question Answering",
"type": "InProceedings",
"year": "2017"
},
"Grimal_2024_tiam": {
"author": "Grimal, Paul and Le Borgne, Herv\\'e and Ferret, Olivier and Tourille, Julien",
"booktitle": "Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)",
"month": "January",
"pages": "2890-2899",
"title": "TIAM - A Metric for Evaluating Alignment in Text-to-Image Generation",
"type": "InProceedings",
"year": "2024"
},
"Hu_2021_ICCV": {
"author": "Hu, Ronghang and Singh, Amanpreet",
"booktitle": "Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)",
"month": "October",
"pages": "1439-1449",
"title": "UniT: Multimodal Multitask Learning With a Unified Transformer",
"type": "InProceedings",
"year": "2021"
},
"Hu_2023_ICCV": {
"author": "Hu, Yushi and Liu, Benlin and Kasai, Jungo and Wang, Yizhong and Ostendorf, Mari and Krishna, Ranjay and Smith, Noah A.",
"booktitle": "Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)",
"month": "October",
"pages": "20406-20417",
"title": "TIFA: Accurate and Interpretable Text-to-Image Faithfulness Evaluation with Question Answering",
"type": "InProceedings",
"year": "2023"
},
"Hudson_2019_CVPR": {
"author": "Hudson, Drew A. and Manning, Christopher D.",
"booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
"month": "June",
"title": "GQA: A New Dataset for Real-World Visual Reasoning and Compositional Question Answering",
"type": "InProceedings",
"year": "2019"
},
"Karras_2019_CVPR": {
"author": "Karras, Tero and Laine, Samuli and Aila, Timo",
"booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
"month": "June",
"title": "A Style-Based Generator Architecture for Generative Adversarial Networks",
"type": "InProceedings",
"year": "2019"
},
"Karras_2020_CVPR": {
"author": "Karras, Tero and Laine, Samuli and Aittala, Miika and Hellsten, Janne and Lehtinen, Jaakko and Aila, Timo",
"booktitle": "IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
"month": "June",
"title": "Analyzing and Improving the Image Quality of StyleGAN",
"type": "InProceedings",
"year": "2020"
},
"Kirstain2023PickaPicAO": {
"author": "Yuval Kirstain and Adam Polyak and Uriel Singer and Shahbuland Matiana and Joe Penna and Omer Levy",
"journal": "ArXiv",
"title": "Pick-a-Pic: An Open Dataset of User Preferences for Text-to-Image Generation",
"type": "article",
"url": "https://api.semanticscholar.org/CorpusID:258437096",
"volume": "abs/2305.01569",
"year": "2023"
},
"Krishna2016VisualGC": {
"author": "Ranjay Krishna and Yuke Zhu and Oliver Groth and Justin Johnson and Kenji Hata and Joshua Kravitz and Stephanie Chen and Yannis Kalantidis and Li-Jia Li and David A. Shamma and Michael S. Bernstein and Li Fei-Fei",
"journal": "International Journal of Computer Vision",
"pages": "32 - 73",
"title": "Visual Genome: Connecting Language and Vision Using Crowdsourced Dense Image Annotations",
"type": "article",
"url": "https://api.semanticscholar.org/CorpusID:4492210",
"volume": "123",
"year": "2016"
},
"Li2023BLIP2BL": {
"author": "Junnan Li and Dongxu Li and Silvio Savarese and Steven C. H. Hoi",
"booktitle": "International Conference on Machine Learning",
"title": "BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models",
"type": "inproceedings",
"url": "https://api.semanticscholar.org/CorpusID:256390509",
"year": "2023"
},
"Li_2019_ICCV": {
"author": "Li, Kunpeng and Zhang, Yulun and Li, Kai and Li, Yuanyuan and Fu, Yun",
"booktitle": "Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)",
"month": "October",
"title": "Visual Semantic Reasoning for Image-Text Matching",
"type": "InProceedings",
"year": "2019"
},
"Li_2022_CVPR": {
"author": "Li, Zhiheng and Min, Martin Renqiang and Li, Kai and Xu, Chenliang",
"booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
"month": "June",
"pages": "18197-18207",
"title": "StyleT2I: Toward Compositional and High-Fidelity Text-to-Image Synthesis",
"type": "InProceedings",
"year": "2022"
},
"Lin_2023_CVPR": {
"author": "Lin, Chen-Hsuan and Gao, Jun and Tang, Luming and Takikawa, Towaki and Zeng, Xiaohui and Huang, Xun and Kreis, Karsten and Fidler, Sanja and Liu, Ming-Yu and Lin, Tsung-Yi",
"booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
"month": "June",
"pages": "300-309",
"title": "Magic3D: High-Resolution Text-to-3D Content Creation",
"type": "InProceedings",
"year": "2023"
},
"Liu2023VisualIT": {
"author": "Haotian Liu and Chunyuan Li and Qingyang Wu and Yong Jae Lee",
"journal": "ArXiv",
"title": "Visual Instruction Tuning",
"type": "article",
"url": "https://api.semanticscholar.org/CorpusID:258179774",
"volume": "abs/2304.08485",
"year": "2023"
},
"Ma_2023_CVPR": {
"author": "Ma, Zixian and Hong, Jerry and Gul, Mustafa Omer and Gandhi, Mona and Gao, Irena and Krishna, Ranjay",
"booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
"month": "June",
"pages": "10910-10921",
"title": "CREPE: Can Vision-Language Foundation Models Reason Compositionally?",
"type": "InProceedings",
"year": "2023"
},
"Metzer_2023_CVPR": {
"author": "Metzer, Gal and Richardson, Elad and Patashnik, Or and Giryes, Raja and Cohen-Or, Daniel",
"booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
"month": "June",
"pages": "12663-12673",
"title": "Latent-NeRF for Shape-Guided Generation of 3D Shapes and Textures",
"type": "InProceedings",
"year": "2023"
},
"NEURIPS2019_65699726": {
"author": "Zhou, Sharon and Gordon, Mitchell and Krishna, Ranjay and Narcomey, Austin and Fei-Fei, Li F and Bernstein, Michael",
"booktitle": "Advances in Neural Information Processing Systems",
"editor": "H. Wallach and H. Larochelle and A. Beygelzimer and F. d\\textquotesingle Alch\\'{e}-Buc and E. Fox and R. Garnett",
"pages": "",
"publisher": "Curran Associates, Inc.",
"title": "HYPE: A Benchmark for Human eYe Perceptual Evaluation of Generative Models",
"type": "inproceedings",
"url": "https://proceedings.neurips.cc/paper_files/paper/2019/file/65699726a3c601b9f31bf04019c8593c-Paper.pdf",
"volume": "32",
"year": "2019"
},
"NEURIPS2019_c74d97b0": {
"author": "Lu, Jiasen and Batra, Dhruv and Parikh, Devi and Lee, Stefan",
"booktitle": "Advances in Neural Information Processing Systems",
"editor": "H. Wallach and H. Larochelle and A. Beygelzimer and F. d\\textquotesingle Alch\\'{e}-Buc and E. Fox and R. Garnett",
"pages": "",
"publisher": "Curran Associates, Inc.",
"title": "ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks",
"type": "inproceedings",
"url": "https://proceedings.neurips.cc/paper_files/paper/2019/file/c74d97b01eae257e44aa9d5bade97baf-Paper.pdf",
"volume": "32",
"year": "2019"
},
"NIPS2011_5dd9db5e": {
"author": "Ordonez, Vicente and Kulkarni, Girish and Berg, Tamara",
"booktitle": "Advances in Neural Information Processing Systems",
"editor": "J. Shawe-Taylor and R. Zemel and P. Bartlett and F. Pereira and K.Q. Weinberger",
"pages": "",
"publisher": "Curran Associates, Inc.",
"title": "Im2Text: Describing Images Using 1 Million Captioned Photographs",
"type": "inproceedings",
"url": "https://proceedings.neurips.cc/paper_files/paper/2011/file/5dd9db5e033da9c6fb5ba83c7a7ebea9-Paper.pdf",
"volume": "24",
"year": "2011"
},
"NIPS2016_8a3363ab": {
"author": "Salimans, Tim and Goodfellow, Ian and Zaremba, Wojciech and Cheung, Vicki and Radford, Alec and Chen, Xi and Chen, Xi",
"booktitle": "Advances in Neural Information Processing Systems",
"editor": "D. Lee and M. Sugiyama and U. Luxburg and I. Guyon and R. Garnett",
"pages": "",
"publisher": "Curran Associates, Inc.",
"title": "Improved Techniques for Training GANs",
"type": "inproceedings",
"url": "https://proceedings.neurips.cc/paper_files/paper/2016/file/8a3363abe792db2d8761d6403605aeb7-Paper.pdf",
"volume": "29",
"year": "2016"
},
"Otani_2023_CVPR": {
"author": "Otani, Mayu and Togashi, Riku and Sawai, Yu and Ishigami, Ryosuke and Nakashima, Yuta and Rahtu, Esa and Heikkil\\\"a, Janne and Satoh, Shin{\\textquoteright}ichi",
"booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
"month": "June",
"pages": "14277-14286",
"title": "Toward Verifiable and Reproducible Human Evaluation for Text-to-Image Generation",
"type": "InProceedings",
"year": "2023"
},
"Rombach_2022_CVPR": {
"author": "Rombach, Robin and Blattmann, Andreas and Lorenz, Dominik and Esser, Patrick and Ommer, Bj\\\"orn",
"booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
"month": "June",
"pages": "10684-10695",
"title": "High-Resolution Image Synthesis With Latent Diffusion Models",
"type": "InProceedings",
"year": "2022"
},
"Ruiz_2023_CVPR": {
"author": "Ruiz, Nataniel and Li, Yuanzhen and Jampani, Varun and Pritch, Yael and Rubinstein, Michael and Aberman, Kfir",
"booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
"month": "June",
"pages": "22500-22510",
"title": "DreamBooth: Fine Tuning Text-to-Image Diffusion Models for Subject-Driven Generation",
"type": "InProceedings",
"year": "2023"
},
"Sahin_2024_WACV": {
"author": "Sahin, Ugur and Li, Hang and Khan, Qadeer and Cremers, Daniel and Tresp, Volker",
"booktitle": "Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)",
"month": "January",
"pages": "5563-5573",
"title": "Enhancing Multimodal Compositional Reasoning of Visual Language Models With Generative Negative Mining",
"type": "InProceedings",
"year": "2024"
},
"Salin_2023_ICCV": {
"author": "Salin, Emmanuelle and Ayache, St\\'ephane and Favre, Benoit",
"booktitle": "Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops",
"month": "October",
"pages": "339-352",
"title": "Towards an Exhaustive Evaluation of Vision-Language Foundation Models",
"type": "InProceedings",
"year": "2023"
},
"Schramowski_2023_CVPR": {
"author": "Schramowski, Patrick and Brack, Manuel and Deiseroth, Bj\\\"orn and Kersting, Kristian",
"booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
"month": "June",
"pages": "22522-22531",
"title": "Safe Latent Diffusion: Mitigating Inappropriate Degeneration in Diffusion Models",
"type": "InProceedings",
"year": "2023"
},
"Simonyan15vgg": {
"author": "Karen Simonyan and Andrew Zisserman",
"booktitle": "International Conference on Learning Representations",
"title": "Very Deep Convolutional Networks for Large-Scale Image Recognition",
"type": "InProceedings",
"year": "2015"
},
"Su2020VL-BERT": {
"author": "Weijie Su and Xizhou Zhu and Yue Cao and Bin Li and Lewei Lu and Furu Wei and Jifeng Dai",
"booktitle": "International Conference on Learning Representations",
"title": "VL-BERT: Pre-training of Generic Visual-Linguistic Representations",
"type": "inproceedings",
"url": "https://openreview.net/forum?id=SygXPaEYvH",
"year": "2020"
},
"Thrush_2022_CVPR": {
"author": "Thrush, Tristan and Jiang, Ryan and Bartolo, Max and Singh, Amanpreet and Williams, Adina and Kiela, Douwe and Ross, Candace",
"booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
"month": "June",
"pages": "5238-5248",
"title": "Winoground: Probing Vision and Language Models for Visio-Linguistic Compositionality",
"type": "InProceedings",
"year": "2022"
},
"Tian2022GenerativeAN": {
"author": "Chunwei Tian and Xuanyu Zhang and Chun-Wei Lin and Wangmeng Zuo and Yanning Zhang",
"journal": "ArXiv",
"title": "Generative Adversarial Networks for Image Super-Resolution: A Survey",
"type": "article",
"url": "https://api.semanticscholar.org/CorpusID:248426817",
"volume": "abs/2204.13620",
"year": "2022"
},
"Tran_2020_ACCV": {
"author": "Tran, Linh Duy and Nguyen, Son Minh and Arai, Masayuki",
"booktitle": "Proceedings of the Asian Conference on Computer Vision (ACCV)",
"month": "November",
"title": "GAN-based Noise Model for Denoising Real Images",
"type": "InProceedings",
"year": "2020"
},
"Vedantam_2015_CVPR": {
"author": "Vedantam, Ramakrishna and Lawrence Zitnick, C. and Parikh, Devi",
"booktitle": "Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)",
"month": "June",
"title": "CIDEr: Consensus-Based Image Description Evaluation",
"type": "InProceedings",
"year": "2015"
},
"Wang_2023_CVPR": {
"author": "Wang, Su and Saharia, Chitwan and Montgomery, Ceslee and Pont-Tuset, Jordi and Noy, Shai and Pellegrini, Stefano and Onoe, Yasumasa and Laszlo, Sarah and Fleet, David J. and Soricut, Radu and Baldridge, Jason and Norouzi, Mohammad and Anderson, Peter and Chan, William",
"booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
"month": "June",
"pages": "18359-18369",
"title": "Imagen Editor and EditBench: Advancing and Evaluating Text-Guided Image Inpainting",
"type": "InProceedings",
"year": "2023"
},
"Wu2023HumanPS": {
"author": "Xiaoshi Wu and Yiming Hao and Keqiang Sun and Yixiong Chen and Feng Zhu and Rui Zhao and Hongsheng Li",
"journal": "ArXiv",
"title": "Human Preference Score v2: A Solid Benchmark for Evaluating Human Preferences of Text-to-Image Synthesis",
"type": "article",
"url": "https://api.semanticscholar.org/CorpusID:259171771",
"volume": "abs/2306.09341",
"year": "2023"
},
"Wu_2023_ICCV": {
"author": "Wu, Xiaoshi and Sun, Keqiang and Zhu, Feng and Zhao, Rui and Li, Hongsheng",
"booktitle": "Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)",
"month": "October",
"pages": "2096-2105",
"title": "Human Preference Score: Better Aligning Text-to-Image Models with Human Preference",
"type": "InProceedings",
"year": "2023"
},
"Yarom2023WhatYS": {
"author": "Michal Yarom and Yonatan Bitton and Soravit Changpinyo and Roee Aharoni and Jonathan Herzig and Oran Lang and Eran. O. Ofek and Idan Szpektor",
"journal": "ArXiv",
"title": "What You See is What You Read? Improving Text-Image Alignment Evaluation",
"type": "article",
"url": "https://api.semanticscholar.org/CorpusID:258740893",
"volume": "abs/2305.10400",
"year": "2023"
},
"Zellers_2019_CVPR": {
"author": "Zellers, Rowan and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin",
"booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
"month": "June",
"title": "From Recognition to Cognition: Visual Commonsense Reasoning",
"type": "InProceedings",
"year": "2019"
},
"Zeng2024IntentTunerAI": {
"author": "Xingchen Zeng and Ziyao Gao and Yilin Ye and Wei Zeng",
"journal": "ArXiv",
"title": "IntentTuner: An Interactive Framework for Integrating Human Intents in Fine-tuning Text-to-Image Generative Models",
"type": "article",
"url": "https://api.semanticscholar.org/CorpusID:267312299",
"volume": "abs/2401.15559",
"year": "2024"
},
"Zhang2020BERTScore": {
"author": "Tianyi Zhang* and Varsha Kishore* and Felix Wu* and Kilian Q. Weinberger and Yoav Artzi",
"booktitle": "International Conference on Learning Representations",
"title": "BERTScore: Evaluating Text Generation with BERT",
"type": "inproceedings",
"url": "https://openreview.net/forum?id=SkeHuCVFDr",
"year": "2020"
},
"Zhang2023CompressA": {
"author": "Lei Zhang and Fangxun Shu and Sucheng Ren and Bingchen Zhao and Hao Jiang and Cihang Xie",
"journal": "ArXiv",
"title": "Compress \\& Align: Curating Image-Text Data with Human Knowledge",
"type": "article",
"url": "https://api.semanticscholar.org/CorpusID:266174263",
"volume": "abs/2312.06726",
"year": "2023"
},
"Zhang_2021_CVPR": {
"author": "Zhang, Pengchuan and Li, Xiujun and Hu, Xiaowei and Yang, Jianwei and Zhang, Lei and Wang, Lijuan and Choi, Yejin and Gao, Jianfeng",
"booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
"month": "June",
"pages": "5579-5588",
"title": "VinVL: Revisiting Visual Representations in Vision-Language Models",
"type": "InProceedings",
"year": "2021"
},
"Zhou_2022_CVPR": {
"author": "Zhou, Xingyi and Koltun, Vladlen and Kr\\\"ahenb\\\"uhl, Philipp",
"booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
"month": "June",
"pages": "7571-7580",
"title": "Simple Multi-Dataset Detection",
"type": "InProceedings",
"year": "2022"
},
"Zhou_2023_CVPR": {
"author": "Zhou, Yutong and Shimada, Nobutaka",
"booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops",
"month": "June",
"pages": "826-842",
"title": "Vision + Language Applications: A Survey",
"type": "InProceedings",
"year": "2023"
},
"Zhu_2015_ICCV": {
"author": "Zhu, Yukun and Kiros, Ryan and Zemel, Rich and Salakhutdinov, Ruslan and Urtasun, Raquel and Torralba, Antonio and Fidler, Sanja",
"booktitle": "Proceedings of the IEEE International Conference on Computer Vision (ICCV)",
"month": "December",
"title": "Aligning Books and Movies: Towards Story-Like Visual Explanations by Watching Movies and Reading Books",
"type": "InProceedings",
"year": "2015"
},
"Zitnick_2013_CVPR": {
"author": "Zitnick, C. L. and Parikh, Devi",
"booktitle": "Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)",
"month": "June",
"title": "Bringing Semantics into Focus Using Visual Abstraction",
"type": "InProceedings",
"year": "2013"
},
"ahmad2022new": {
"author": "Ahmad, Waqar and Ali, Hazrat and Shah, Zubair and Azmat, Shoaib",
"journal": "Scientific Reports",
"number": "1",
"pages": "9533",
"publisher": "Nature Publishing Group UK London",
"title": "A new generative adversarial network for medical images super resolution",
"type": "article",
"volume": "12",
"year": "2022"
},
"bai2021mifid": {
"author": "Bai, Ching-Yuan and Lin, Hsuan-Tien and Raffel, Colin and Kan, Wendy Chi-wen",
"booktitle": "Proceedings of the 27th ACM SIGKDD Conference on Knowledge Discovery \\& Data Mining",
"pages": "2534--2542",
"title": "On training sample memorization: Lessons from benchmarking generative modeling with a large-scale competition",
"type": "inproceedings",
"year": "2021"
},
"balanced_vqa_v2": {
"author": "Yash Goyal and Tejas Khot and Douglas Summers{-}Stay and Dhruv Batra and Devi Parikh",
"booktitle": "Conference on Computer Vision and Pattern Recognition (CVPR)",
"title": "Making the {V} in {VQA} Matter: Elevating the Role of Image Understanding in {V}isual {Q}uestion {A}nswering",
"type": "InProceedings",
"year": "2017"
},
"banerjee-lavie-2005-meteor": {
"address": "Ann Arbor, Michigan",
"author": "Banerjee, Satanjeev and Lavie, Alon",
"booktitle": "Proceedings of the {ACL} Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and/or Summarization",
"editor": "Goldstein, Jade and Lavie, Alon and Lin, Chin-Yew and Voss, Clare",
"month": "jun",
"pages": "65--72",
"publisher": "Association for Computational Linguistics",
"title": "{METEOR}: An Automatic Metric for {MT} Evaluation with Improved Correlation with Human Judgments",
"type": "inproceedings",
"url": "https://aclanthology.org/W05-0909",
"year": "2005"
},
"barratt2018note": {
"author": "Barratt, Shane and Sharma, Rishi",
"journal": "arXiv preprint arXiv:1801.01973",
"title": "A note on the inception score",
"type": "article",
"year": "2018"
},
"baryshnikov2023hypernymy": {
"author": "Baryshnikov, Anton and Ryabinin, Max",
"journal": "arXiv preprint arXiv:2310.09247",
"title": "Hypernymy Understanding Evaluation of Text-to-Image Models via WordNet Hierarchy",
"type": "article",
"year": "2023"
},
"betti2023let": {
"author": "Betti, Federico and Staiano, Jacopo and Baraldi, Lorenzo and Baraldi, Lorenzo and Cucchiara, Rita and Sebe, Nicu",
"booktitle": "Proceedings of the 31st ACM International Conference on Multimedia",
"pages": "9306--9312",
"title": "Let's ViCE! Mimicking Human Cognitive Behavior in Image Generation Evaluation",
"type": "inproceedings",
"year": "2023"
},
"binkowski2018kid": {
"author": "Bi{\\'n}kowski, Miko{\\l}aj and Sutherland, Danica J and Arbel, Michael and Gretton, Arthur",
"booktitle": "International Conference on Learning Representations",
"title": "Demystifying MMD GANs",
"type": "inproceedings",
"year": "2018"
},
"borji2022pros": {
"author": "Borji, Ali",
"journal": "Computer Vision and Image Understanding",
"pages": "103329",
"publisher": "Elsevier",
"title": "Pros and cons of GAN evaluation measures: New developments",
"type": "article",
"volume": "215",
"year": "2022"
},
"brock2018large": {
"author": "Brock, Andrew and Donahue, Jeff and Simonyan, Karen",
"journal": "arXiv preprint arXiv:1809.11096",
"title": "Large scale GAN training for high fidelity natural image synthesis",
"type": "article",
"year": "2018"
},
"brown2020language": {
"author": "Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and others",
"journal": "Advances in neural information processing systems",
"pages": "1877--1901",
"title": "Language models are few-shot learners",
"type": "article",
"volume": "33",
"year": "2020"
},
"caron2021emerging": {
"author": "Caron, Mathilde and Touvron, Hugo and Misra, Ishan and J{\\'e}gou, Herv{\\'e} and Mairal, Julien and Bojanowski, Piotr and Joulin, Armand",
"booktitle": "Proceedings of the IEEE/CVF international conference on computer vision",
"pages": "9650--9660",
"title": "Emerging properties in self-supervised vision transformers",
"type": "inproceedings",
"year": "2021"
},
"castro2024clove": {
"author": "Castro, Santiago and Ziai, Amir and Saluja, Avneesh and Yuan, Zhuoning and Mihalcea, Rada",
"journal": "arXiv preprint arXiv:2402.15021",
"title": "CLoVe: Encoding Compositional Language in Contrastive Vision-Language Models",
"type": "article",
"year": "2024"
},
"chambers2010flumejava": {
"abstract": "MapReduce and similar systems significantly ease the task of writing data-parallel code. However, many real-world computations require a pipeline of MapReduces, and programming and managing such pipelines can be difficult. We present FlumeJava, a Java library that makes it easy to develop, test, and run efficient data-parallel pipelines. At the core of the FlumeJava library are a couple of classes that represent immutable parallel collections, each supporting a modest number of operations for processing them in parallel. Parallel collections and their operations present a simple, high-level, uniform abstraction over different data representations and execution strategies. To enable parallel operations to run efficiently, FlumeJava defers their evaluation, instead internally constructing an execution plan dataflow graph. When the final results of the parallel operations are eventually needed, FlumeJava first optimizes the execution plan, and then executes the optimized operations on appropriate underlying primitives (e.g., MapReduces). The combination of high-level abstractions for parallel data and computation, deferred evaluation and optimization, and efficient parallel primitives yields an easy-to-use system that approaches the efficiency of hand-optimized pipelines. FlumeJava is in active use by hundreds of pipeline developers within Google.",
"address": "New York, NY, USA",
"author": "Chambers, Craig and Raniwala, Ashish and Perry, Frances and Adams, Stephen and Henry, Robert R. and Bradshaw, Robert and Weizenbaum, Nathan",
"booktitle": "Proceedings of the 31st ACM SIGPLAN Conference on Programming Language Design and Implementation",
"doi": "10.1145/1806596.1806638",
"isbn": "9781450300193",
"keywords": "data-parallel programming, java, mapreduce",
"location": "Toronto, Ontario, Canada",
"numpages": "13",
"pages": "363\u2013375",
"publisher": "Association for Computing Machinery",
"series": "PLDI '10",
"title": "FlumeJava: easy, efficient data-parallel pipelines",
"type": "inproceedings",
"url": "https://doi.org/10.1145/1806596.1806638",
"year": "2010"
},
"changpinyo2022all": {
"author": "Changpinyo, Soravit and Kukliansky, Doron and Szpektor, Idan and Chen, Xi and Ding, Nan and Soricut, Radu",
"journal": "arXiv preprint arXiv:2205.01883",
"title": "All you may need for vqa are image captions",
"type": "article",
"year": "2022"
},
"chao:iccv2015": {
"author": "Yu-Wei Chao and Zhan Wang and Yugeng He and Jiaxuan Wang and Jia Deng",
"booktitle": "Proceedings of the IEEE International Conference on Computer Vision",
"title": "HICO: A Benchmark for Recognizing Human-Object Interactions in Images",
"type": "INPROCEEDINGS",
"year": "2015"
},
"che2016mode": {
"author": "Che, Tong and Li, Yanran and Jacob, Athul and Bengio, Yoshua and Li, Wenjie",
"booktitle": "International Conference on Learning Representations",
"title": "Mode Regularized Generative Adversarial Networks",
"type": "inproceedings",
"year": "2016"
},
"chefer2023attend": {
"author": "Chefer, Hila and Alaluf, Yuval and Vinker, Yael and Wolf, Lior and Cohen-Or, Daniel",
"journal": "ACM Transactions on Graphics (TOG)",
"number": "4",
"pages": "1--10",
"publisher": "ACM New York, NY, USA",
"title": "Attend-and-excite: Attention-based semantic guidance for text-to-image diffusion models",
"type": "article",
"volume": "42",
"year": "2023"
},
"chen2015microsoft": {
"author": "Chen, Xinlei and Fang, Hao and Lin, Tsung-Yi and Vedantam, Ramakrishna and Gupta, Saurabh and Doll{\\'a}r, Piotr and Zitnick, C Lawrence",
"journal": "arXiv preprint arXiv:1504.00325",
"title": "Microsoft coco captions: Data collection and evaluation server",
"type": "article",
"year": "2015"
},
"chen2020uniter": {
"author": "Chen, Yen-Chun and Li, Linjie and Yu, Licheng and El Kholy, Ahmed and Ahmed, Faisal and Gan, Zhe and Cheng, Yu and Liu, Jingjing",
"booktitle": "European conference on computer vision",
"organization": "Springer",
"pages": "104--120",
"title": "Uniter: Universal image-text representation learning",
"type": "inproceedings",
"year": "2020"
},
"chen2022pali": {
"author": "Chen, Xi and Wang, Xiao and Changpinyo, Soravit and Piergiovanni, AJ and Padlewski, Piotr and Salz, Daniel and Goodman, Sebastian and Grycner, Adam and Mustafa, Basil and Beyer, Lucas and others",
"journal": "arXiv preprint arXiv:2209.06794",
"title": "Pali: A jointly-scaled multilingual language-image model",
"type": "article",
"year": "2022"
},
"dash2017tac": {
"author": "Dash, Ayushman and Gamboa, John Cristian Borges and Ahmed, Sheraz and Liwicki, Marcus and Afzal, Muhammad Zeshan",
"journal": "arXiv preprint arXiv:1703.06412",
"title": "Tac-gan-text conditioned auxiliary classifier generative adversarial network",
"type": "article",
"year": "2017"
},
"dehouche2023s": {
"author": "Dehouche, Nassim and Dehouche, Kullathida",
"journal": "Heliyon",
"number": "6",
"publisher": "Elsevier",
"title": "What\u2019s in a text-to-image prompt? The potential of stable diffusion in visual arts education",
"type": "article",
"volume": "9",
"year": "2023"
},
"devlin2018bert": {
"author": "Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina",
"journal": "arXiv preprint arXiv:1810.04805",
"title": "Bert: Pre-training of deep bidirectional transformers for language understanding",
"type": "article",
"year": "2018"
},
"dinh2022tise": {
"author": "Dinh, Tan M and Nguyen, Rang and Hua, Binh-Son",
"booktitle": "European Conference on Computer Vision",
"organization": "Springer",
"pages": "594--609",
"title": "TISE: Bag of metrics for text-to-image synthesis evaluation",
"type": "inproceedings",
"year": "2022"
},
"du2007ergas": {
"author": "Du, Qian and Younan, Nicholas H and King, Roger and Shah, Vijay P",
"journal": "IEEE Geoscience and Remote Sensing Letters",
"number": "4",
"pages": "518--522",
"publisher": "IEEE",
"title": "On the performance evaluation of pan-sharpening techniques",
"type": "article",
"volume": "4",
"year": "2007"
},
"dunlap2023describing": {
"author": "Dunlap, Lisa and Zhang, Yuhui and Wang, Xiaohan and Zhong, Ruiqi and Darrell, Trevor and Steinhardt, Jacob and Gonzalez, Joseph E and Yeung-Levy, Serena",
"journal": "arXiv preprint arXiv:2312.02974",
"title": "Describing Differences in Image Sets with Natural Language",
"type": "article",
"year": "2023"
},
"faghri2018vse++": {
"author": "Faghri, Fartash and Fleet, David J and Kiros, Jamie Ryan and Fidler, Sanja",
"booktitle": "Proceedings of the British Machine Vision Conference ({BMVC})",
"title": "VSE\\+\\+: Improving Visual-Semantic Embeddings with Hard Negatives",
"type": "article",
"url": "https://github.com/fartashf/vsepp",
"year": "2018"
},
"feng2023trainingfree": {
"author": "Weixi Feng and Xuehai He and Tsu-Jui Fu and Varun Jampani and Arjun Reddy Akula and Pradyumna Narayana and Sugato Basu and Xin Eric Wang and William Yang Wang",
"booktitle": "The Eleventh International Conference on Learning Representations",
"title": "Training-Free Structured Diffusion Guidance for Compositional Text-to-Image Synthesis",
"type": "inproceedings",
"url": "https://openreview.net/forum?id=PUIqjT4rzq7",
"year": "2023"
},
"fu2023dreamsim": {
"author": "Fu, Stephanie and Tamir, Netanel and Sundaram, Shobhita and Chai, Lucy and Zhang, Richard and Dekel, Tali and Isola, Phillip",
"journal": "arXiv preprint arXiv:2306.09344",
"title": "DreamSim: Learning New Dimensions of Human Visual Similarity using Synthetic Data",
"type": "article",
"year": "2023"
},
"gal2022clipdirsim": {
"author": "Gal, Rinon and Patashnik, Or and Maron, Haggai and Bermano, Amit H and Chechik, Gal and Cohen-Or, Daniel",
"journal": "ACM Transactions on Graphics (TOG)",
"number": "4",
"pages": "1--13",
"publisher": "ACM New York, NY, USA",
"title": "StyleGAN-NADA: CLIP-guided domain adaptation of image generators",
"type": "article",
"volume": "41",
"year": "2022"
},
"gan2020large": {
"author": "Gan, Zhe and Chen, Yen-Chun and Li, Linjie and Zhu, Chen and Cheng, Yu and Liu, Jingjing",
"journal": "Advances in Neural Information Processing Systems",
"pages": "6616--6628",
"title": "Large-scale adversarial training for vision-and-language representation learning",
"type": "article",
"volume": "33",
"year": "2020"
},
"gordon2023mismatch": {
"archiveprefix": "arXiv",
"author": "Brian Gordon and Yonatan Bitton and Yonatan Shafir and Roopal Garg and Xi Chen and Dani Lischinski and Daniel Cohen-Or and Idan Szpektor",
"eprint": "2312.03766",
"primaryclass": "cs.CL",
"title": "Mismatch Quest: Visual and Textual Feedback for Image-Text Misalignment",
"type": "misc",
"year": "2023"
},
"gretton2006kernel": {
"author": "Gretton, Arthur and Borgwardt, Karsten and Rasch, Malte and Sch{\\\"o}lkopf, Bernhard and Smola, Alex",
"journal": "Advances in neural information processing systems",
"title": "A kernel method for the two-sample-problem",
"type": "article",
"volume": "19",
"year": "2006"
},
"gu2020giqa": {
"author": "Gu, Shuyang and Bao, Jianmin and Chen, Dong and Wen, Fang",
"booktitle": "Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XI 16",
"organization": "Springer",
"pages": "369--385",
"title": "Giqa: Generated image quality assessment",
"type": "inproceedings",
"year": "2020"
},
"gu2023automated": {
"author": "Gu, Siqi",
"journal": "arXiv preprint arXiv:2312.12933",
"title": "Automated Testing for Text-to-Image Software",
"type": "article",
"year": "2023"
},
"hartwig2022learning": {
"abstract": "Abstract View quality measures compute scores for given views and are used to determine an optimal view in viewpoint selection tasks. Unfortunately, despite the wide adoption of these measures, they are rather based on computational quantities, such as entropy, than human preferences. To instead tailor viewpoint measures towards humans, view quality measures need to be able to capture human viewpoint preferences. Therefore, we introduce a large-scale crowdsourced data set, which contains 58k annotated viewpoints for 3220 ModelNet40 models. Based on this data, we derive a neural view quality measure abiding to human preferences. We further demonstrate that this view quality measure not only generalizes to models unseen during training, but also to unseen model categories. We are thus able to predict view qualities for single images, and directly predict human preferred viewpoints for 3D models by exploiting point-based learning technology, without requiring to generate intermediate images or sampling the view sphere. We will detail our data collection procedure, describe the data analysis and model training and will evaluate the predictive quality of our trained viewpoint measure on unseen models and categories. To our knowledge, this is the first deep learning approach to predict a view quality measure solely based on human preferences.",
"author": "Hartwig, S. and Schelling, M. and Onzenoodt, C. v. and V\u00e1zquez, P.-P. and Hermosilla, P. and Ropinski, T.",
"doi": "https://doi.org/10.1111/cgf.14613",
"eprint": "https://onlinelibrary.wiley.com/doi/pdf/10.1111/cgf.14613",
"journal": "Computer Graphics Forum",
"keywords": "user studies, interaction, perceptually-based rendering, rendering",
"number": "6",
"pages": "453-466",
"title": "Learning Human Viewpoint Preferences from Sparsely Annotated Models",
"type": "article",
"url": "https://onlinelibrary.wiley.com/doi/abs/10.1111/cgf.14613",
"volume": "41",
"year": "2022"
},
"he2016deep": {
"author": "He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian",
"booktitle": "Proceedings of the IEEE conference on computer vision and pattern recognition",
"pages": "770--778",
"title": "Deep residual learning for image recognition",
"type": "inproceedings",
"year": "2016"
},
"hessel2021clipscore": {
"author": "Hessel, Jack and Holtzman, Ari and Forbes, Maxwell and Bras, Ronan Le and Choi, Yejin",
"journal": "arXiv preprint arXiv:2104.08718",
"title": "Clipscore: A reference-free evaluation metric for image captioning",
"type": "article",
"year": "2021"
},
"hinz2020semantic": {
"author": "Hinz, Tobias and Heinrich, Stefan and Wermter, Stefan",
"journal": "IEEE transactions on pattern analysis and machine intelligence",
"number": "3",
"pages": "1552--1565",
"publisher": "IEEE",
"title": "Semantic object accuracy for generative text-to-image synthesis",
"type": "article",
"volume": "44",
"year": "2020"
},
"ho2022imagen": {
"archiveprefix": "arXiv",
"author": "Jonathan Ho and William Chan and Chitwan Saharia and Jay Whang and Ruiqi Gao and Alexey Gritsenko and Diederik P. Kingma and Ben Poole and Mohammad Norouzi and David J. Fleet and Tim Salimans",
"eprint": "2210.02303",
"primaryclass": "cs.CV",
"title": "Imagen Video: High Definition Video Generation with Diffusion Models",
"type": "misc",
"year": "2022"
},
"ho2022video": {
"archiveprefix": "arXiv",
"author": "Jonathan Ho and Tim Salimans and Alexey Gritsenko and William Chan and Mohammad Norouzi and David J. Fleet",
"eprint": "2204.03458",
"primaryclass": "cs.CV",
"title": "Video Diffusion Models",
"type": "misc",
"year": "2022"
},
"hochreiter1997lstm": {
"author": "Hochreiter, Sepp and Schmidhuber, J{\\\"u}rgen",
"journal": "Neural computation",
"number": "8",
"pages": "1735--1780",
"publisher": "MIT press",
"title": "Long short-term memory",
"type": "article",
"volume": "9",
"year": "1997"
},
"hodosh2013framing": {
"author": "Hodosh, Micah and Young, Peter and Hockenmaier, Julia",
"journal": "Journal of Artificial Intelligence Research",
"pages": "853--899",
"title": "Framing image description as a ranking task: Data, models and evaluation metrics",
"type": "article",
"volume": "47",
"year": "2013"
},
"honnibal2017spacy": {
"author": "Honnibal, Matthew and Montani, Ines",
"journal": "To appear",
"number": "1",
"pages": "411--420",
"title": "spaCy 2: Natural language understanding with Bloom embeddings, convolutional neural networks and incremental parsing",
"type": "article",
"volume": "7",
"year": "2017"
},
"hsieh2023sugarcrepe": {
"author": "Cheng-Yu Hsieh and Jieyu Zhang and Zixian Ma and Aniruddha Kembhavi and Ranjay Krishna",
"booktitle": "Thirty-seventh Conference on Neural Information Processing Systems Datasets and Benchmarks Track",
"title": "SugarCrepe: Fixing Hackable Benchmarks for Vision-Language Compositionality",
"type": "inproceedings",
"url": "https://openreview.net/forum?id",
"year": "2023"
},
"huang2023t2icompbench": {
"author": "Kaiyi Huang and Kaiyue Sun and Enze Xie and Zhenguo Li and Xihui Liu",
"journal": "arXiv preprint arXiv: 2307.06350",
"title": "T2I-CompBench: A Comprehensive Benchmark for Open-world Compositional Text-to-image Generation",
"type": "article",
"year": "2023"
},
"ijcai2022p759": {
"author": "Cao, Min and Li, Shiping and Li, Juntao and Nie, Liqiang and Zhang, Min",
"booktitle": "Proceedings of the Thirty-First International Joint Conference on Artificial Intelligence, {IJCAI-22}",
"doi": "10.24963/ijcai.2022/759",
"editor": "Lud De Raedt",
"month": "7",
"note": "Survey Track",
"pages": "5410--5417",
"publisher": "International Joint Conferences on Artificial Intelligence Organization",
"title": "Image-text Retrieval: A Survey on Recent Research and Development",
"type": "inproceedings",
"url": "https://doi.org/10.24963/ijcai.2022/759",
"year": "2022"
},
"jenkins2019unsupervised": {
"author": "Jenkins, Porter and Farag, Ahmad and Wang, Suhang and Li, Zhenhui",
"booktitle": "Proceedings of the 28th ACM international conference on information and knowledge management",
"pages": "1993--2002",
"title": "Unsupervised representation learning of spatial data via multimodal embedding",
"type": "inproceedings",
"year": "2019"
},
"jiang2019tiger": {
"author": "Jiang, Ming and Huang, Qiuyuan and Zhang, Lei and Wang, Xin and Zhang, Pengchuan and Gan, Zhe and Diesner, Jana and Gao, Jianfeng",
"journal": "arXiv preprint arXiv:1909.02050",
"title": "Tiger: Text-to-image grounding for image caption evaluation",
"type": "article",
"year": "2019"
},
"kamath-etal-2023-text": {
"address": "\"Singapore\",",
"author": "\"Kamath, Amita and Hessel, Jack and Chang, Kai-Wei\",",
"booktitle": "\"Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing\",",
"doi": "\"10.18653/v1/2023.emnlp-main.301\",",
"editor": "\"Bouamor, Houda and Pino, Juan and Bali, Kalika\",",
"month": "dec,",
"pages": "\"4933--4944\"",
"publisher": "\"Association for Computational Linguistics\",",
"title": "\"Text encoders bottleneck compositionality in contrastive vision-language models\",",
"type": "inproceedings",
"url": "\"https://aclanthology.org/2023.emnlp-main.301\",",
"year": "\"2023\","
},
"kamath2023s": {
"author": "Kamath, Amita and Hessel, Jack and Chang, Kai-Wei",
"journal": "arXiv preprint arXiv:2310.19785",
"title": "What's\" up\" with vision-language models? Investigating their struggle with spatial reasoning",
"type": "article",
"year": "2023"
},
"kane-etal-2020-nubia": {
"abstract": "\"We present NUBIA, a methodology to build automatic evaluation metrics for text generation using only machine learning models as core components. A typical NUBIA model is composed of three modules: a neural feature extractor, an aggregator and a calibrator. We demonstrate an implementation of NUBIA showing competitive performance with stateof-the art metrics used to evaluate machine translation and state-of-the art results for image captions quality evaluation. In addition to strong performance, NUBIA models have the advantage of being modular and improve in synergy with advances in text generation models.\",",
"address": "\"Online (Dublin, Ireland)\",",
"author": "\"Kane, Hassan and Kocyigit, Muhammed Yusuf and Abdalla, Ali and Ajanoh, Pelkins and Coulibali, Mohamed\",",
"booktitle": "\"Proceedings of the 1st Workshop on Evaluating NLG Evaluation\",",
"editor": "\"Agarwal, Shubham and Du{\\v{s}}ek, Ond{\\v{r}}ej and Gehrmann, Sebastian and Gkatzia, Dimitra and Konstas, Ioannis and Van Miltenburg, Emiel and Santhanam, Sashank\",",
"month": "dec,",
"pages": "\"28--37\",",
"publisher": "\"Association for Computational Linguistics\",",
"title": "\"{NUBIA}: {N}e{U}ral Based Interchangeability Assessor for Text Generation\",",
"type": "inproceedings",
"url": "\"https://aclanthology.org/2020.evalnlgeval-1.4\",",
"year": "\"2020\","
},
"kang2023scaling": {
"author": "Kang, Minguk and Zhu, Jun-Yan and Zhang, Richard and Park, Jaesik and Shechtman, Eli and Paris, Sylvain and Park, Taesung",
"booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition",
"pages": "10124--10134",
"title": "Scaling up gans for text-to-image synthesis",
"type": "inproceedings",
"year": "2023"
},
"karras2019style": {
"author": "Karras, Tero and Laine, Samuli and Aila, Timo",
"booktitle": "Proceedings of the IEEE/CVF conference on computer vision and pattern recognition",
"pages": "4401--4410",
"title": "A style-based generator architecture for generative adversarial networks",
"type": "inproceedings",
"year": "2019"
},
"khashabi2020unifiedqa": {
"author": "Khashabi, Daniel and Min, Sewon and Khot, Tushar and Sabharwal, Ashish and Tafjord, Oyvind and Clark, Peter and Hajishirzi, Hannaneh",
"journal": "arXiv preprint arXiv:2005.00700",
"title": "Unifiedqa: Crossing format boundaries with a single qa system",
"type": "article",
"year": "2020"
},
"kim2021vilt": {
"author": "Kim, Wonjae and Son, Bokyung and Kim, Ildoo",
"booktitle": "International Conference on Machine Learning",
"organization": "PMLR",
"pages": "5583--5594",
"title": "Vilt: Vision-and-language transformer without convolution or region supervision",
"type": "inproceedings",
"year": "2021"
},
"kim2022mutual": {
"author": "Kim, Jin-Hwa and Kim, Yunji and Lee, Jiyoung and Yoo, Kang Min and Lee, Sang-Woo",
"journal": "Advances in Neural Information Processing Systems",
"pages": "35072--35086",
"title": "Mutual information divergence: A unified metric for multimodal generative models",
"type": "article",
"volume": "35",
"year": "2022"
},
"kim2023imagine": {
"author": "Yeongbin Kim and Gautam Singh and Junyeong Park and Caglar Gulcehre and Sungjin Ahn",
"booktitle": "Thirty-seventh Conference on Neural Information Processing Systems Datasets and Benchmarks Track",
"title": "Imagine the Unseen World: A Benchmark for Systematic Generalization in Visual World Models",
"type": "inproceedings",
"url": "https://openreview.net/forum?id",
"year": "2023"
},
"kiros2014unifying": {
"author": "Kiros, Ryan and Salakhutdinov, Ruslan and Zemel, Richard S",
"journal": "arXiv preprint arXiv:1411.2539",
"title": "Unifying visual-semantic embeddings with multimodal neural language models",
"type": "article",
"year": "2014"
},
"kiros2018illustrative": {
"author": "Kiros, Jamie and Chan, William and Hinton, Geoffrey",
"booktitle": "Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
"pages": "922--933",
"title": "Illustrative language understanding: Large-scale visual grounding with image search",
"type": "inproceedings",
"year": "2018"
},
"ku2023viescore": {
"author": "Ku, Max and Jiang, Dongfu and Wei, Cong and Yue, Xiang and Chen, Wenhu",
"journal": "arXiv preprint arXiv:2312.14867",
"title": "VIEScore: Towards Explainable Metrics for Conditional Image Synthesis Evaluation",
"type": "article",
"year": "2023"
},
"kumar2023comprehensive": {
"author": "Kumar, Satyam and Musharaf, Dayima and Musharaf, Seerat and Sagar, Anil Kumar",
"booktitle": "International Conference on Advanced Communication and Intelligent Systems",
"organization": "Springer",
"pages": "90--103",
"title": "A Comprehensive Review of the Latest Advancements in Large Generative AI Models",
"type": "inproceedings",
"year": "2023"
},
"kuznetsova2020open": {
"author": "Kuznetsova, Alina and Rom, Hassan and Alldrin, Neil and Uijlings, Jasper and Krasin, Ivan and Pont-Tuset, Jordi and Kamali, Shahab and Popov, Stefan and Malloci, Matteo and Kolesnikov, Alexander and others",
"journal": "International Journal of Computer Vision",
"number": "7",
"pages": "1956--1981",
"publisher": "Springer",
"title": "The open images dataset v4: Unified image classification, object detection, and visual relationship detection at scale",
"type": "article",
"volume": "128",
"year": "2020"
},
"kynkaanniemi2019improved": {
"author": "Kynk{\\\"a}{\\\"a}nniemi, Tuomas and Karras, Tero and Laine, Samuli and Lehtinen, Jaakko and Aila, Timo",
"journal": "Advances in Neural Information Processing Systems",
"title": "Improved precision and recall metric for assessing generative models",
"type": "article",
"volume": "32",
"year": "2019"
},
"lavie2004significance": {
"author": "Lavie, Alon and Sagae, Kenji and Jayaraman, Shyamsundar",
"booktitle": "Machine Translation: From Real Users to Research: 6th Conference of the Association for Machine Translation in the Americas, AMTA 2004, Washington, DC, USA, September 28-October 2, 2004. Proceedings 6",
"organization": "Springer",
"pages": "134--143",
"title": "The significance of recall in automatic metrics for MT evaluation",
"type": "inproceedings",
"year": "2004"
},
"lee-etal-2020-vilbertscore": {
"address": "\"Online\",",
"author": "\"Lee, Hwanhee and Yoon, Seunghyun and Dernoncourt, Franck and Kim, Doo Soon and Bui, Trung and Jung, Kyomin\",",
"booktitle": "\"Proceedings of the First Workshop on Evaluation and Comparison of NLP Systems\",",
"doi": "\"10.18653/v1/2020.eval4nlp-1.4\",",
"editor": "\"Eger, Steffen and Gao, Yang and Peyrard, Maxime and Zhao, Wei and Hovy, Eduard\",",
"month": "nov,",
"pages": "\"34--39\",",
"publisher": "\"Association for Computational Linguistics\",",
"title": "\"{V}i{LBERTS}core: Evaluating Image Caption Using Vision-and-Language {BERT}\",",
"type": "inproceedings",
"url": "\"https://aclanthology.org/2020.eval4nlp-1.4\",",
"year": "\"2020\","
},
"lee2018stacked": {
"author": "Lee, Kuang-Huei and Chen, Xi and Hua, Gang and Hu, Houdong and He, Xiaodong",
"booktitle": "Proceedings of the European conference on computer vision (ECCV)",
"pages": "201--216",
"title": "Stacked cross attention for image-text matching",
"type": "inproceedings",
"year": "2018"
},
"lee2023aligning": {
"archiveprefix": "arXiv",
"author": "Kimin Lee and Hao Liu and Moonkyung Ryu and Olivia Watkins and Yuqing Du and Craig Boutilier and Pieter Abbeel and Mohammad Ghavamzadeh and Shixiang Shane Gu",
"eprint": "2302.12192",
"primaryclass": "cs.LG",
"title": "Aligning Text-to-Image Models using Human Feedback",
"type": "misc",
"year": "2023"
},
"lee2023holistic": {
"author": "Tony Lee and Michihiro Yasunaga and Chenlin Meng and Yifan Mai and Joon Sung Park and Agrim Gupta and Yunzhi Zhang and Deepak Narayanan and Hannah Benita Teufel and Marco Bellagente and Minguk Kang and Taesung Park and Jure Leskovec and Jun-Yan Zhu and Li Fei-Fei and Jiajun Wu and Stefano Ermon and Percy Liang",
"booktitle": "Thirty-seventh Conference on Neural Information Processing Systems Datasets and Benchmarks Track",
"title": "Holistic Evaluation of Text-to-Image Models",
"type": "inproceedings",
"url": "https://openreview.net/forum?id",
"year": "2023"
},
"lee2023text": {
"author": "Lee, Jaewoong and Jang, Sangwon and Jo, Jaehyeong and Yoon, Jaehong and Kim, Yunji and Kim, Jin-Hwa and Ha, Jung-Woo and Hwang, Sung Ju",
"journal": "arXiv preprint arXiv:2304.01515",
"title": "Text-Conditioned Sampling Framework for Text-to-Image Generation with Masked Generative Models",
"type": "article",
"year": "2023"
},
"lewis2022does": {
"author": "Lewis, Martha and Nayak, Nihal V and Yu, Peilin and Yu, Qinan and Merullo, Jack and Bach, Stephen H and Pavlick, Ellie",
"journal": "arXiv preprint arXiv:2212.10537",
"title": "Does clip bind concepts? probing compositionality in large image models",
"type": "article",
"year": "2022"
},
"li-etal-2022-mplug": {
"abstract": "\"Large-scale pre-trained foundation models have been an emerging paradigm for building artificial intelligence (AI) systems, which can be quickly adapted to a wide range of downstream tasks. This paper presents mPLUG, a new vision-language foundation model for both cross-modal understanding and generation. Most existing pre-trained models suffer from inefficiency and linguistic signal overwhelmed by long visual sequences in cross-modal alignment. To address both problems, mPLUG introduces an effective and efficient vision-language architecture with novel cross-modal skip-connections.mPLUG is pre-trained end-to-end on large-scale image-text pairs with both discriminative and generative objectives. It achieves state-of-the-art results on a wide range of vision-language downstream tasks, including image captioning, image-text retrieval, visual grounding and visual question answering. mPLUG also demonstrates strong zero-shot transferability on vision-language and video-language tasks. The code and pre-trained models are available at \\url{https://github.com/alibaba/AliceMind}\",",
"address": "\"Abu Dhabi, United Arab Emirates\",",
"author": "\"Li, Chenliang and Xu, Haiyang and Tian, Junfeng and Wang, Wei and Yan, Ming and Bi, Bin and Ye, Jiabo and Chen, He and Xu, Guohai and Cao, Zheng and Zhang, Ji and Huang, Songfang and Huang, Fei and Zhou, Jingren and Si, Luo\",",
"booktitle": "\"Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing\",",
"doi": "\"10.18653/v1/2022.emnlp-main.488\",",
"editor": "\"Goldberg, Yoav and Kozareva, Zornitsa and Zhang, Yue\",",
"month": "dec,",
"pages": "\"7241--7259\",",
"publisher": "\"Association for Computational Linguistics\",",
"title": "\"m{PLUG}: Effective and Efficient Vision-Language Learning by Cross-modal Skip-connections\",",
"type": "inproceedings",
"url": "\"https://aclanthology.org/2022.emnlp-main.488\",",
"year": "\"2022\","
},
"li2020oscar": {
"author": "Li, Xiujun and Yin, Xi and Li, Chunyuan and Zhang, Pengchuan and Hu, Xiaowei and Zhang, Lei and Wang, Lijuan and Hu, Houdong and Dong, Li and Wei, Furu and others",
"booktitle": "Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XXX 16",
"organization": "Springer",
"pages": "121--137",
"title": "Oscar: Object-semantics aligned pre-training for vision-language tasks",
"type": "inproceedings",
"year": "2020"
},
"li2022blip": {
"author": "Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven",
"booktitle": "International Conference on Machine Learning",
"organization": "PMLR",
"pages": "12888--12900",
"title": "Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation",
"type": "inproceedings",
"year": "2022"
},
"li2023covlm": {
"author": "Li, Junyan and Chen, Delin and Hong, Yining and Chen, Zhenfang and Chen, Peihao and Shen, Yikang and Gan, Chuang",
"journal": "arXiv preprint arXiv:2311.03354",
"title": "CoVLM: Composing Visual Entities and Relationships in Large Language Models Via Communicative Decoding",
"type": "article",
"year": "2023"
},
"li2023divide": {
"author": "Li, Yumeng and Keuper, Margret and Zhang, Dan and Khoreva, Anna",
"journal": "arXiv preprint arXiv:2307.10864",
"title": "Divide \\& bind your attention for improved generative semantic nursing",
"type": "article",
"year": "2023"
},
"li2024compositional": {
"author": "Junyan Li and Delin Chen and Yining Hong and Zhenfang Chen and Peihao Chen and Yikang Shen and Chuang Gan",
"booktitle": "The Twelfth International Conference on Learning Representations",
"title": "Compositional {VLM}: Composing Visual Entities and Relationships in Large Language Models Via Communicative Decoding",
"type": "inproceedings",
"url": "https://openreview.net/forum?id",
"year": "2024"
},
"liang2020cpgan": {
"author": "Liang, Jiadong and Pei, Wenjie and Lu, Feng",
"booktitle": "Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part IV 16",
"organization": "Springer",
"pages": "491--508",
"title": "Cpgan: Content-parsing generative adversarial networks for text-to-image synthesis",
"type": "inproceedings",
"year": "2020"
},
"liang2023rich": {
"author": "Liang, Youwei and He, Junfeng and Li, Gang and Li, Peizhao and Klimovskiy, Arseniy and Carolan, Nicholas and Sun, Jiao and Pont-Tuset, Jordi and Young, Sarah and Yang, Feng and others",
"journal": "arXiv preprint arXiv:2312.10240",
"title": "Rich Human Feedback for Text-to-Image Generation",
"type": "article",
"year": "2023"
},
"lin-2004-rouge": {
"address": "\"Barcelona, Spain\",",
"author": "\"Lin, Chin-Yew\",",
"booktitle": "\"Text Summarization Branches Out\",",
"month": "jul,",
"pages": "\"74--81\",",
"publisher": "\"Association for Computational Linguistics\",",
"title": "\"{ROUGE}: A Package for Automatic Evaluation of Summaries\",",
"type": "inproceedings",
"url": "\"https://aclanthology.org/W04-1013\",",
"year": "\"2004\","
},
"lin2014microsoft": {
"author": "Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\\'a}r, Piotr and Zitnick, C Lawrence",
"booktitle": "Computer Vision--ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13",
"organization": "Springer",
"pages": "740--755",
"title": "Microsoft coco: Common objects in context",
"type": "inproceedings",
"year": "2014"
},
"lin2023revisiting": {
"author": "Lin, Zhiqiu and Chen, Xinyue and Pathak, Deepak and Zhang, Pengchuan and Ramanan, Deva",
"journal": "arXiv preprint arXiv:2306.01879",
"title": "Revisiting the Role of Language Priors in Vision-Language Models",
"type": "article",
"year": "2023"
},
"liu2021swin": {
"author": "Liu, Ze and Lin, Yutong and Cao, Yue and Hu, Han and Wei, Yixuan and Zhang, Zheng and Lin, Stephen and Guo, Baining",
"booktitle": "Proceedings of the IEEE/CVF international conference on computer vision",
"pages": "10012--10022",
"title": "Swin transformer: Hierarchical vision transformer using shifted windows",
"type": "inproceedings",
"year": "2021"
},
"liu2022compositional": {
"author": "Liu, Nan and Li, Shuang and Du, Yilun and Torralba, Antonio and Tenenbaum, Joshua B",
"booktitle": "European Conference on Computer Vision",
"organization": "Springer",
"pages": "423--439",
"title": "Compositional visual generation with composable diffusion models",
"type": "inproceedings",
"year": "2022"
},
"liu2024fetv": {
"author": "Liu, Yuanxin and Li, Lei and Ren, Shuhuai and Gao, Rundong and Li, Shicheng and Chen, Sishuo and Sun, Xu and Hou, Lu",
"journal": "Advances in Neural Information Processing Systems",
"title": "Fetv: A benchmark for fine-grained evaluation of open-domain text-to-video generation",
"type": "article",
"volume": "36",
"year": "2024"
},
"lopez2016revisiting": {
"author": "Lopez-Paz, David and Oquab, Maxime",
"booktitle": "International Conference on Learning Representations",
"title": "Revisiting Classifier Two-Sample Tests",
"type": "inproceedings",
"year": "2016"
},
"lu2023llmscore": {
"author": "Yujie Lu and Xianjun Yang and Xiujun Li and Xin Eric Wang and William Yang Wang",
"booktitle": "Thirty-seventh Conference on Neural Information Processing Systems",
"title": "LLMScore: Unveiling the Power of Large Language Models in Text-to-Image Synthesis Evaluation",
"type": "inproceedings",
"url": "https://openreview.net/forum?id",
"year": "2023"
},
"ma2023examination": {
"author": "Ma, Teli and Li, Rong and Liang, Junwei",
"journal": "arXiv preprint arXiv:2308.10509",
"title": "An examination of the compositionality of large generative vision-language models",
"type": "article",
"year": "2023"
},
"ma2024cobra": {
"author": "Ma, Zheng and Wang, Changxin and Ouyang, Yawen and Zhao, Fei and Zhang, Jianbing and Huang, Shujian and Chen, Jiajun",
"journal": "arXiv preprint arXiv:2402.11572",
"title": "Cobra Effect in Reference-Free Image Captioning Metrics",
"type": "article",
"year": "2024"
},
"madhyastha-etal-2019-vifidel": {
"address": "\"Florence, Italy\",",
"author": "\"Madhyastha, Pranava and Wang, Josiah and Specia, Lucia\",",
"booktitle": "\"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics\",",
"doi": "\"10.18653/v1/P19-1654\",",
"editor": "\"Korhonen, Anna and Traum, David and M{\\`a}rquez, Llu{\\'\\i}s\",",
"month": "jul,",
"pages": "\"6539--6550\",",
"publisher": "\"Association for Computational Linguistics\",",
"title": "\"{VIFIDEL}: Evaluating the Visual Fidelity of Image Descriptions\",",
"type": "inproceedings",
"url": "\"https://aclanthology.org/P19-1654\",",
"year": "\"2019\","
},
"mao2016training": {
"abstract": "In this paper, we focus on training and evaluating effective word embeddings with both text and visual information. More specifically, we introduce a large-scale dataset with 300 million sentences describing over 40 million images crawled and downloaded from publicly available Pins (i.e. an image with sentence descriptions uploaded by users) on Pinterest [2]. This dataset is more than 200 times larger than MS COCO [22], the standard large-scale image dataset with sentence descriptions. In addition, we construct an evaluation dataset to directly assess the effectiveness of word embeddings in terms of finding semantically similar or related words and phrases. The word/phrase pairs in this evaluation dataset are collected from the click data with millions of users in an image search system, thus contain rich semantic relationships. Based on these datasets, we propose and compare several Recurrent Neural Networks (RNNs) based multimodal (text and image) models. Experiments show that our model benefits from incorporating the visual information into the word embeddings, and a weight sharing strategy is crucial for learning such multimodal embeddings. The project page is: http://www.stat.ucla.edu/~junhua.mao/multimodal_embedding.html.",
"address": "Red Hook, NY, USA",
"author": "Mao, Junhua and Xu, Jiajing and Jing, Yushi and Yuille, Alan",
"booktitle": "Proceedings of the 30th International Conference on Neural Information Processing Systems",
"isbn": "9781510838819",
"location": "Barcelona, Spain",
"numpages": "9",
"pages": "442\u2013450",
"publisher": "Curran Associates Inc.",
"series": "NIPS'16",
"title": "Training and evaluating multimodal word embeddings with large-scale web annotated images",
"type": "inproceedings",
"year": "2016"
},
"marcus2022preliminary": {
"archiveprefix": "arXiv",
"author": "Gary Marcus and Ernest Davis and Scott Aaronson",
"eprint": "2204.13807",
"primaryclass": "cs.CV",
"title": "A very preliminary analysis of DALL-E 2",
"type": "misc",
"year": "2022"
},
"menendez1997jensen": {
"abstract": "In this paper we investigate the Jensen-Shannon parametric divergence for testing goodness-of-fit for point estimation. Most of the work presented is an analytical study of the asymptotic differences between different members of the family proposed in goodness of fit, together with an examination of closer approximations to the exact distribution of these statistics than the commonly used chi-squared distribution. Finally the minimum Jensen-Shannon divergence estimates are introduced and compared with other well-known estimators by computer simulation.",
"author": "M.L. Men\u00e9ndez and J.A. Pardo and L. Pardo and M.C. Pardo",
"doi": "https://doi.org/10.1016/S0016-0032(96)00063-4",
"issn": "0016-0032",
"journal": "Journal of the Franklin Institute",
"number": "2",
"pages": "307-318",
"title": "The Jensen-Shannon divergence",
"type": "article",
"url": "https://www.sciencedirect.com/science/article/pii/S0016003296000634",
"volume": "334",
"year": "1997"
},
"miller1995Wordnet": {
"abstract": "Because meaningful sentences are composed of meaningful words, any system that hopes to process natural languages as people do must have information about words and their meanings. This information is traditionally provided through dictionaries, and machine-readable dictionaries are now widely available. But dictionary entries evolved for the convenience of human readers, not for machines. WordNet1 provides a more effective combination of traditional lexicographic information and modern computing. WordNet is an online lexical database designed for use under program control. English nouns, verbs, adjectives, and adverbs are organized into sets of synonyms, each representing a lexicalized concept. Semantic relations link the synonym sets [4].",
"address": "New York, NY, USA",
"author": "Miller, George A.",
"doi": "10.1145/219717.219748",
"issn": "0001-0782",
"issue_date": "Nov. 1995",
"journal": "Commun. ACM",
"month": "nov",
"number": "11",
"numpages": "3",
"pages": "39\u201341",
"publisher": "Association for Computing Machinery",
"title": "WordNet: a lexical database for English",
"type": "article",
"url": "https://doi.org/10.1145/219717.219748",
"volume": "38",
"year": "1995"
},
"minderer2022simple": {
"author": "Minderer, M and Gritsenko, A and Stone, A and Neumann, M and Weissenborn, D and Dosovitskiy, A and Mahendran, A and Arnab, A and Dehghani, M and Shen, Z and others",
"journal": "arXiv preprint arXiv:2205.06230",
"title": "Simple open-vocabulary object detection with vision transformers. arxiv 2022",
"type": "article",
"volume": "2",
"year": "2022"
},
"moorthy2011blind": {
"author": "Moorthy, Anush Krishna and Bovik, Alan Conrad",
"journal": "IEEE transactions on Image Processing",
"number": "12",
"pages": "3350--3364",
"publisher": "IEEE",
"title": "Blind image quality assessment: From natural scene statistics to perceptual quality",
"type": "article",
"volume": "20",
"year": "2011"
},
"openai2023gpt": {
"author": "OpenAI, R",
"journal": "View in Article",
"pages": "13",
"title": "Gpt-4 technical report. arxiv 2303.08774",
"type": "article",
"volume": "2",
"year": "2023"
},
"park2021benchmark": {
"author": "Dong Huk Park and Samaneh Azadi and Xihui Liu and Trevor Darrell and Anna Rohrbach",
"booktitle": "Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 1)",
"title": "Benchmark for Compositional Text-to-Image Synthesis",
"type": "inproceedings",
"url": "https://openreview.net/forum?id",
"year": "2021"
},
"pascal-voc-2008": {
"author": "\"Everingham, M. and Van~Gool, L. and Williams, C. K. I. and Winn, J. and Zisserman, A.\",",
"howpublished": "\"http://www.pascal-network.org/challenges/VOC/voc2008/workshop/index.html\"",
"title": "\"The {PASCAL} {V}isual {O}bject {C}lasses {C}hallenge 2008 {(VOC2008)} {R}esults\",",
"type": "misc"
},
"plummer2015flickr30kentities": {
"author": "Plummer, Bryan A. and Wang, Liwei and Cervantes, Chris M. and Caicedo, Juan C. and Hockenmaier, Julia and Lazebnik, Svetlana",
"booktitle": "2015 IEEE International Conference on Computer Vision (ICCV)",
"doi": "10.1109/ICCV.2015.303",
"keywords": "Standards;Benchmark testing;Image resolution;Grounding;Glass;Training;Image color analysis",
"number": "",
"pages": "2641-2649",
"title": "Flickr30k Entities: Collecting Region-to-Phrase Correspondences for Richer Image-to-Sentence Models",
"type": "INPROCEEDINGS",
"volume": "",
"year": "2015"
},
"pmlr-v139-cho21a": {
"abstract": "Existing methods for vision-and-language learning typically require designing task-specific architectures and objectives for each task. For example, a multi-label answer classifier for visual question answering, a region scorer for referring expression comprehension, and a language decoder for image captioning, etc. To alleviate these hassles, in this work, we propose a unified framework that learns different tasks in a single architecture with the same language modeling objective, i.e., multimodal conditional text generation, where our models learn to generate labels in text based on the visual and textual inputs. On 7 popular vision-and-language benchmarks, including visual question answering, referring expression comprehension, visual commonsense reasoning, most of which have been previously modeled as discriminative tasks, our generative approach (with a single unified architecture) reaches comparable performance to recent task-specific state-of-the-art vision-and-language models. Moreover, our generative approach shows better generalization ability on questions that have rare answers. Also, we show that our framework allows multi-task learning in a single architecture with a single set of parameters, achieving similar performance to separately optimized single-task models. Our code is publicly available at: https://github.com/j-min/VL-T5",
"author": "Cho, Jaemin and Lei, Jie and Tan, Hao and Bansal, Mohit",
"booktitle": "Proceedings of the 38th International Conference on Machine Learning",
"editor": "Meila, Marina and Zhang, Tong",
"month": "18--24 Jul",
"pages": "1931--1942",
"pdf": "http://proceedings.mlr.press/v139/cho21a/cho21a.pdf",
"publisher": "PMLR",
"series": "Proceedings of Machine Learning Research",
"title": "Unifying Vision-and-Language Tasks via Text Generation",
"type": "InProceedings",
"url": "https://proceedings.mlr.press/v139/cho21a.html",
"volume": "139",
"year": "2021"
},
"pmlr-v139-kim21k": {
"abstract": "Vision-and-Language Pre-training (VLP) has improved performance on various joint vision-and-language downstream tasks. Current approaches to VLP heavily rely on image feature extraction processes, most of which involve region supervision (e.g., object detection) and the convolutional architecture (e.g., ResNet). Although disregarded in the literature, we find it problematic in terms of both (1) efficiency/speed, that simply extracting input features requires much more computation than the multimodal interaction steps; and (2) expressive power, as it is upper bounded to the expressive power of the visual embedder and its predefined visual vocabulary. In this paper, we present a minimal VLP model, Vision-and-Language Transformer (ViLT), monolithic in the sense that the processing of visual inputs is drastically simplified to just the same convolution-free manner that we process textual inputs. We show that ViLT is up to tens of times faster than previous VLP models, yet with competitive or better downstream task performance. Our code and pre-trained weights are available at https://github.com/dandelin/vilt.",
"author": "Kim, Wonjae and Son, Bokyung and Kim, Ildoo",
"booktitle": "Proceedings of the 38th International Conference on Machine Learning",
"editor": "Meila, Marina and Zhang, Tong",
"month": "18--24 Jul",
"pages": "5583--5594",
"pdf": "http://proceedings.mlr.press/v139/kim21k/kim21k.pdf",
"publisher": "PMLR",
"series": "Proceedings of Machine Learning Research",
"title": "ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision",
"type": "InProceedings",
"url": "http://proceedings.mlr.press/v139/kim21k.html",
"volume": "139",
"year": "2021"
},
"pmlr-v37-kusnerb15": {
"abstract": "We present the Word Mover\u2019s Distance (WMD), a novel distance function between text documents. Our work is based on recent results in word embeddings that learn semantically meaningful representations for words from local co-occurrences in sentences. The WMD distance measures the dissimilarity between two text documents as the minimum amount of distance that the embedded words of one document need to \"travel\" to reach the embedded words of another document. We show that this distance metric can be cast as an instance of the Earth Mover\u2019s Distance, a well studied transportation problem for which several highly efficient solvers have been developed. Our metric has no hyperparameters and is straight-forward to implement. Further, we demonstrate on eight real world document classification data sets, in comparison with seven state-of-the-art baselines, that the WMD metric leads to unprecedented low k-nearest neighbor document classification error rates.",
"address": "Lille, France",
"author": "Kusner, Matt and Sun, Yu and Kolkin, Nicholas and Weinberger, Kilian",
"booktitle": "Proceedings of the 32nd International Conference on Machine Learning",
"editor": "Bach, Francis and Blei, David",
"month": "07--09 Jul",
"pages": "957--966",
"pdf": "http://proceedings.mlr.press/v37/kusnerb15.pdf",
"publisher": "PMLR",
"series": "Proceedings of Machine Learning Research",
"title": "From Word Embeddings To Document Distances",
"type": "InProceedings",
"url": "https://proceedings.mlr.press/v37/kusnerb15.html",
"volume": "37",
"year": "2015"
},
"po2023state": {
"archiveprefix": "arXiv",
"author": "Ryan Po and Wang Yifan and Vladislav Golyanik and Kfir Aberman and Jonathan T. Barron and Amit H. Bermano and Eric Ryan Chan and Tali Dekel and Aleksander Holynski and Angjoo Kanazawa and C. Karen Liu and Lingjie Liu and Ben Mildenhall and Matthias Nie\u00dfner and Bj\u00f6rn Ommer and Christian Theobalt and Peter Wonka and Gordon Wetzstein",
"eprint": "2310.07204",
"primaryclass": "cs.AI",
"title": "State of the Art on Diffusion Models for Visual Computing",
"type": "misc",
"year": "2023"
},
"prabhudesai2023aligning": {
"archiveprefix": "arXiv",
"author": "Mihir Prabhudesai and Anirudh Goyal and Deepak Pathak and Katerina Fragkiadaki",
"eprint": "2310.03739",
"primaryclass": "cs.CV",
"title": "Aligning Text-to-Image Diffusion Models with Reward Backpropagation",
"type": "misc",
"year": "2023"
},
"radford2019language": {
"author": "Radford, Alec and Wu, Jeffrey and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya and others",
"journal": "OpenAI blog",
"number": "8",
"pages": "9",
"title": "Language models are unsupervised multitask learners",
"type": "article",
"volume": "1",
"year": "2019"
},
"radford2021learning": {
"author": "Radford, Alec and Kim, Jong Wook and Hallacy, Chris and Ramesh, Aditya and Goh, Gabriel and Agarwal, Sandhini and Sastry, Girish and Askell, Amanda and Mishkin, Pamela and Clark, Jack and others",
"booktitle": "International conference on machine learning",
"organization": "PMLR",
"pages": "8748--8763",
"title": "Learning transferable visual models from natural language supervision",
"type": "inproceedings",
"year": "2021"
},
"ramesh2021zero": {
"author": "Ramesh, Aditya and Pavlov, Mikhail and Goh, Gabriel and Gray, Scott and Voss, Chelsea and Radford, Alec and Chen, Mark and Sutskever, Ilya",
"booktitle": "International conference on machine learning",
"organization": "Pmlr",
"pages": "8821--8831",
"title": "Zero-shot text-to-image generation",
"type": "inproceedings",
"year": "2021"
},
"rashtchian2010collecting": {
"author": "Rashtchian, Cyrus and Young, Peter and Hodosh, Micah and Hockenmaier, Julia",
"booktitle": "Proceedings of the NAACL HLT 2010 workshop on creating speech and language data with Amazon\u2019s Mechanical Turk",
"pages": "139--147",
"title": "Collecting image annotations using amazon\u2019s mechanical turk",
"type": "inproceedings",
"year": "2010"
},
"ravuri2019classification": {
"author": "Ravuri, Suman and Vinyals, Oriol",
"journal": "Advances in neural information processing systems",
"title": "Classification accuracy score for conditional generative models",
"type": "article",
"volume": "32",
"year": "2019"
},
"ray2024cola": {
"author": "Ray, Arijit and Radenovic, Filip and Dubey, Abhimanyu and Plummer, Bryan and Krishna, Ranjay and Saenko, Kate",
"journal": "Advances in Neural Information Processing Systems",
"title": "cola: A Benchmark for Compositional Text-to-image Retrieval",
"type": "article",
"volume": "36",
"year": "2024"
},
"reed2016generative": {
"author": "Reed, Scott and Akata, Zeynep and Yan, Xinchen and Logeswaran, Lajanugen and Schiele, Bernt and Lee, Honglak",
"booktitle": "International conference on machine learning",
"organization": "PMLR",
"pages": "1060--1069",
"title": "Generative adversarial text to image synthesis",
"type": "inproceedings",
"year": "2016"
},
"reed2016learning": {
"author": "Reed, Scott E and Akata, Zeynep and Mohan, Santosh and Tenka, Samuel and Schiele, Bernt and Lee, Honglak",
"journal": "Advances in neural information processing systems",
"title": "Learning what and where to draw",
"type": "article",
"volume": "29",
"year": "2016"
},
"reis2023real": {
"author": "Reis, Dillon and Kupec, Jordan and Hong, Jacqueline and Daoudi, Ahmad",
"journal": "arXiv preprint arXiv:2305.09972",
"title": "Real-time flying object detection with YOLOv8",
"type": "article",
"year": "2023"
},
"roberts2022scaling": {
"archiveprefix": "arXiv",
"author": "Adam Roberts and Hyung Won Chung and Anselm Levskaya and Gaurav Mishra and James Bradbury and Daniel Andor and Sharan Narang and Brian Lester and Colin Gaffney and Afroz Mohiuddin and Curtis Hawthorne and Aitor Lewkowycz and Alex Salcianu and Marc van Zee and Jacob Austin and Sebastian Goodman and Livio Baldini Soares and Haitang Hu and Sasha Tsvyashchenko and Aakanksha Chowdhery and Jasmijn Bastings and Jannis Bulian and Xavier Garcia and Jianmo Ni and Andrew Chen and Kathleen Kenealy and Jonathan H. Clark and Stephan Lee and Dan Garrette and James Lee-Thorp and Colin Raffel and Noam Shazeer and Marvin Ritter and Maarten Bosma and Alexandre Passos and Jeremy Maitin-Shepard and Noah Fiedel and Mark Omernick and Brennan Saeta and Ryan Sepassi and Alexander Spiridonov and Joshua Newlan and Andrea Gesmundo",
"eprint": "2203.17189",
"primaryclass": "cs.LG",
"title": "Scaling Up Models and Data with $\\texttt{t5x}$ and $\\texttt{seqio}$",
"type": "misc",
"year": "2022"
},
"rohrbach2016movie": {
"archiveprefix": "arXiv",
"author": "Anna Rohrbach and Atousa Torabi and Marcus Rohrbach and Niket Tandon and Christopher Pal and Hugo Larochelle and Aaron Courville and Bernt Schiele",
"eprint": "1605.03705",
"primaryclass": "cs.CV",
"title": "Movie Description",
"type": "misc",
"year": "2016"
},
"saharia2022photorealistic": {
"author": "Saharia, Chitwan and Chan, William and Saxena, Saurabh and Li, Lala and Whang, Jay and Denton, Emily L and Ghasemipour, Kamyar and Gontijo Lopes, Raphael and Karagol Ayan, Burcu and Salimans, Tim and others",
"journal": "Advances in Neural Information Processing Systems",
"pages": "36479--36494",
"title": "Photorealistic text-to-image diffusion models with deep language understanding",
"type": "article",
"volume": "35",
"year": "2022"
},
"sajjadi2018assessing": {
"author": "Sajjadi, Mehdi SM and Bachem, Olivier and Lucic, Mario and Bousquet, Olivier and Gelly, Sylvain",
"journal": "Advances in neural information processing systems",
"title": "Assessing generative models via precision and recall",
"type": "article",
"volume": "31",
"year": "2018"
},
"schuhmann2022laion": {
"author": "Schuhmann, Christoph and Beaumont, Romain and Vencu, Richard and Gordon, Cade and Wightman, Ross and Cherti, Mehdi and Coombes, Theo and Katta, Aarush and Mullis, Clayton and Wortsman, Mitchell and others",
"journal": "Advances in Neural Information Processing Systems",
"pages": "25278--25294",
"title": "Laion-5b: An open large-scale dataset for training next generation image-text models",
"type": "article",
"volume": "35",
"year": "2022"
},
"sharma2018conceptual": {
"author": "Sharma, Piyush and Ding, Nan and Goodman, Sebastian and Soricut, Radu",
"booktitle": "Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
"pages": "2556--2565",
"title": "Conceptual captions: A cleaned, hypernymed, image alt-text dataset for automatic image captioning",
"type": "inproceedings",
"year": "2018"
},
"singh2023coarse": {
"author": "Singh, Harman and Zhang, Pengchuan and Wang, Qifan and Wang, Mengjiao and Xiong, Wenhan and Du, Jingfei and Chen, Yu",
"journal": "arXiv preprint arXiv:2305.13812",
"title": "Coarse-to-Fine Contrastive Learning in Image-Text-Graph Space for Improved Vision-Language Compositionality",
"type": "article",
"year": "2023"
},
"singh2023divide": {
"author": "Singh, Jaskirat and Zheng, Liang",
"journal": "arXiv preprint arXiv:2307.04749",
"title": "Divide, Evaluate, and Refine: Evaluating and Improving Text-to-Image Alignment with Iterative VQA Feedback",
"type": "article",
"year": "2023"
},
"suhr2017corpus": {
"author": "Suhr, Alane and Lewis, Mike and Yeh, James and Artzi, Yoav",
"booktitle": "Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)",
"pages": "217--223",
"title": "A corpus of natural language for visual reasoning",
"type": "inproceedings",
"year": "2017"
},
"szegedy2015going": {
"author": "Szegedy, Christian and Liu, Wei and Jia, Yangqing and Sermanet, Pierre and Reed, Scott and Anguelov, Dragomir and Erhan, Dumitru and Vanhoucke, Vincent and Rabinovich, Andrew",
"booktitle": "Proceedings of the IEEE conference on computer vision and pattern recognition",
"pages": "1--9",
"title": "Going deeper with convolutions",
"type": "inproceedings",
"year": "2015"
},
"szegedy2016inception": {
"author": "Szegedy, Christian and Vanhoucke, Vincent and Ioffe, Sergey and Shlens, Jon and Wojna, Zbigniew",
"booktitle": "Proceedings of the IEEE conference on computer vision and pattern recognition",
"pages": "2818--2826",
"title": "Rethinking the inception architecture for computer vision",
"type": "inproceedings",
"year": "2016"
},
"tan2019lxmert": {
"author": "Tan, Hao and Bansal, Mohit",
"journal": "arXiv preprint arXiv:1908.07490",
"title": "Lxmert: Learning cross-modality encoder representations from transformers",
"type": "article",
"year": "2019"
},
"wang-etal-2023-diffusiondb": {
"abstract": "With recent advancements in diffusion models, users can generate high-quality images by writing text prompts in natural language. However, generating images with desired details requires proper prompts, and it is often unclear how a model reacts to different prompts or what the best prompts are. To help researchers tackle these critical challenges, we introduce DiffusionDB, the first large-scale text-to-image prompt dataset totaling 6.5TB, containing 14 million images generated by Stable Diffusion, 1.8 million unique prompts, and hyperparameters specified by real users. We analyze the syntactic and semantic characteristics of prompts. We pinpoint specific hyperparameter values and prompt styles that can lead to model errors and present evidence of potentially harmful model usage, such as the generation of misinformation. The unprecedented scale and diversity of this human-actuated dataset provide exciting research opportunities in understanding the interplay between prompts and generative models, detecting deepfakes, and designing human-AI interaction tools to help users more easily use these models. DiffusionDB is publicly available at: \\url{https://poloclub.github.io/diffusiondb}.",
"address": "Toronto, Canada",
"author": "Wang, Zijie J. and Montoya, Evan and Munechika, David and Yang, Haoyang and Hoover, Benjamin and Chau, Duen Horng",
"booktitle": "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
"doi": "10.18653/v1/2023.acl-long.51",
"editor": "Rogers, Anna and Boyd-Graber, Jordan and Okazaki, Naoaki",
"month": "jul",
"pages": "893--911",
"publisher": "Association for Computational Linguistics",
"title": "{D}iffusion{DB}: A Large-scale Prompt Gallery Dataset for Text-to-Image Generative Models",
"type": "inproceedings",
"url": "https://aclanthology.org/2023.acl-long.51",
"year": "2023"
},
"wang-gaizauskas-2015-generating": {
"address": "Brighton, UK",
"author": "Wang, Josiah and Gaizauskas, Robert",
"booktitle": "Proceedings of the 15th {E}uropean Workshop on Natural Language Generation ({ENLG})",
"doi": "10.18653/v1/W15-4722",
"editor": "Belz, Anya and Gatt, Albert and Portet, Fran{\\c{c}}ois and Purver, Matthew",
"month": "sep",
"pages": "117--126",
"publisher": "Association for Computational Linguistics",
"title": "Generating Image Descriptions with Gold Standard Visual Inputs: Motivation, Evaluation and Baselines",
"type": "inproceedings",
"url": "https://aclanthology.org/W15-4722",
"year": "2015"
},
"wang2003multiscale": {
"author": "Wang, Zhou and Simoncelli, Eero P and Bovik, Alan C",
"booktitle": "The Thrity-Seventh Asilomar Conference on Signals, Systems \\& Computers, 2003",
"organization": "Ieee",
"pages": "1398--1402",
"title": "Multiscale structural similarity for image quality assessment",
"type": "inproceedings",
"volume": "2",
"year": "2003"
},
"wang2004image": {
"author": "Wang, Zhou and Bovik, Alan C and Sheikh, Hamid R and Simoncelli, Eero P",
"journal": "IEEE transactions on image processing",
"number": "4",
"pages": "600--612",
"publisher": "IEEE",
"title": "Image quality assessment: from error visibility to structural similarity",
"type": "article",
"volume": "13",
"year": "2004"
},
"wang2023clipiqa": {
"author": "Wang, Jianyi and Chan, Kelvin CK and Loy, Chen Change",
"booktitle": "Proceedings of the AAAI Conference on Artificial Intelligence",
"number": "2",
"pages": "2555--2563",
"title": "Exploring clip for assessing the look and feel of images",
"type": "inproceedings",
"volume": "37",
"year": "2023"
},
"wolff2023the": {
"author": "Max Wolff and Wieland Brendel and Stuart Wolff",
"booktitle": "ICLR 2023 Workshop on Mathematical and Empirical Understanding of Foundation Models",
"title": "The Independent Compositional Subspace Hypothesis for the Structure of {CLIP}'s Last Layer",
"type": "inproceedings",
"url": "https://openreview.net/forum?id",
"year": "2023"
},
"wu2022grit": {
"author": "Wu, Jialian and Wang, Jianfeng and Yang, Zhengyuan and Gan, Zhe and Liu, Zicheng and Yuan, Junsong and Wang, Lijuan",
"journal": "arXiv preprint arXiv:2212.00280",
"title": "Grit: A generative region-to-text transformer for object understanding",
"type": "article",
"year": "2022"
},
"xie2019visual": {
"archiveprefix": "arXiv",
"author": "Ning Xie and Farley Lai and Derek Doran and Asim Kadav",
"eprint": "1811.10582",
"primaryclass": "cs.CV",
"title": "Visual Entailment Task for Visually-Grounded Language Learning",
"type": "misc",
"year": "2019"
},
"xu-etal-2023-metarevision": {
"address": "Singapore",
"author": "Xu, Guangyue and Kordjamshidi, Parisa and Chai, Joyce",
"booktitle": "Findings of the Association for Computational Linguistics: EMNLP 2023",
"doi": "10.18653/v1/2023.findings-emnlp.818",
"editor": "Bouamor, Houda and Pino, Juan and Bali, Kalika",
"month": "dec",
"pages": "12224--12236",
"publisher": "Association for Computational Linguistics",
"title": "{M}eta{R}e{V}ision: Meta-Learning with Retrieval for Visually Grounded Compositional Concept Acquisition",
"type": "inproceedings",
"url": "https://aclanthology.org/2023.findings-emnlp.818",
"year": "2023"
},
"xu2023imagereward": {
"author": "Xu, Jiazheng and Liu, Xiao and Wu, Yuchen and Tong, Yuxuan and Li, Qinkai and Ding, Ming and Tang, Jie and Dong, Yuxiao",
"journal": "arXiv preprint arXiv:2304.05977",
"title": "Imagereward: Learning and evaluating human preferences for text-to-image generation",
"type": "article",
"year": "2023"
},
"yang2023diffusion": {
"author": "Yang, Ling and Zhang, Zhilong and Song, Yang and Hong, Shenda and Xu, Runsheng and Zhao, Yue and Zhang, Wentao and Cui, Bin and Yang, Ming-Hsuan",
"journal": "ACM Computing Surveys",
"number": "4",
"pages": "1--39",
"publisher": "ACM New York, NY, USA",
"title": "Diffusion models: A comprehensive survey of methods and applications",
"type": "article",
"volume": "56",
"year": "2023"
},
"yarom2023seetrue": {
"archiveprefix": "arXiv",
"author": "Michal Yarom and Yonatan Bitton and Soravit Changpinyo and Roee Aharoni and Jonathan Herzig and Oran Lang and Eran Ofek and Idan Szpektor",
"eprint": "2305.10400",
"primaryclass": "cs.CL",
"title": "What You See is What You Read? Improving Text-Image Alignment Evaluation",
"type": "misc",
"year": "2023"
},
"young2014flickr30k": {
"abstract": "We propose to use the visual denotations of linguistic expressions (i.e. the set of images they describe) to define novel denotational similarity metrics, which we show to be at least as beneficial as distributional similarities for two tasks that require semantic inference. To compute these denotational similarities, we construct a denotation graph, i.e. a subsumption hierarchy over constituents and their denotations, based on a large corpus of 30K images and 150K descriptive captions.",
"address": "Cambridge, MA",
"author": "Young, Peter and Lai, Alice and Hodosh, Micah and Hockenmaier, Julia",
"doi": "10.1162/tacl_a_00166",
"editor": "Lin, Dekang and Collins, Michael and Lee, Lillian",
"journal": "Transactions of the Association for Computational Linguistics",
"pages": "67--78",
"publisher": "MIT Press",
"title": "From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions",
"type": "article",
"url": "https://aclanthology.org/Q14-1006",
"volume": "2",
"year": "2014"
},
"young2014image": {
"author": "Young, Peter and Lai, Alice and Hodosh, Micah and Hockenmaier, Julia",
"journal": "Transactions of the Association for Computational Linguistics",
"pages": "67--78",
"publisher": "MIT Press One Rogers Street, Cambridge, MA 02142-1209, USA journals-info~\u2026",
"title": "From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions",
"type": "article",
"volume": "2",
"year": "2014"
},
"yuksekgonul2022and": {
"author": "Yuksekgonul, Mert and Bianchi, Federico and Kalluri, Pratyusha and Jurafsky, Dan and Zou, James",
"booktitle": "The Eleventh International Conference on Learning Representations",
"title": "When and Why Vision-Language Models Behave like Bags-Of-Words, and What to Do About It?",
"type": "inproceedings",
"year": "2022"
},
"zelaszczyk2024text": {
"author": "{\\.Z}elaszczyk, Maciej and Ma{\\'n}dziuk, Jacek",
"journal": "arXiv preprint arXiv:2401.11631",
"title": "Text-to-Image Cross-Modal Generation: A Systematic Review",
"type": "article",
"year": "2024"
},
"zhang2011fsim": {
"author": "Zhang, Lin and Zhang, Lei and Mou, Xuanqin and Zhang, David",
"journal": "IEEE transactions on Image Processing",
"number": "8",
"pages": "2378--2386",
"publisher": "IEEE",
"title": "FSIM: A feature similarity index for image quality assessment",
"type": "article",
"volume": "20",
"year": "2011"
},
"zhang2018unreasonable": {
"author": "Zhang, Richard and Isola, Phillip and Efros, Alexei A and Shechtman, Eli and Wang, Oliver",
"booktitle": "Proceedings of the IEEE conference on computer vision and pattern recognition",
"pages": "586--595",
"title": "The unreasonable effectiveness of deep features as a perceptual metric",
"type": "inproceedings",
"year": "2018"
},
"zhang2022perceptual": {
"author": "Zhang, Lingzhi and Zhou, Yuqian and Barnes, Connelly and Amirghodsi, Sohrab and Lin, Zhe and Shechtman, Eli and Shi, Jianbo",
"booktitle": "European Conference on Computer Vision",
"organization": "Springer",
"pages": "146--164",
"title": "Perceptual artifacts localization for inpainting",
"type": "inproceedings",
"year": "2022"
},
"zhang2023perceptual": {
"author": "Zhang, Lingzhi and Xu, Zhengjie and Barnes, Connelly and Zhou, Yuqian and Liu, Qing and Zhang, He and Amirghodsi, Sohrab and Lin, Zhe and Shechtman, Eli and Shi, Jianbo",
"booktitle": "Proceedings of the IEEE/CVF International Conference on Computer Vision",
"pages": "7579--7590",
"title": "Perceptual Artifacts Localization for Image Synthesis Tasks",
"type": "inproceedings",
"year": "2023"
},
"zhang2023spot": {
"author": "Zhang, Gengyuan and Bi, Jinhe and Gu, Jindong and Tresp, Volker",
"journal": "arXiv preprint arXiv:2311.12919",
"title": "SPOT! Revisiting Video-Language Models for Event Understanding",
"type": "article",
"year": "2023"
},
"zhang2023texttoimage": {
"archiveprefix": "arXiv",
"author": "Chenshuang Zhang and Chaoning Zhang and Mengchun Zhang and In So Kweon",
"eprint": "2303.07909",
"primaryclass": "cs.CV",
"title": "Text-to-image Diffusion Models in Generative AI: A Survey",
"type": "misc",
"year": "2023"
},
"zhao-etal-2022-explainable": {
"address": "Abu Dhabi, UAE",
"author": "Zhao, Tiancheng and Zhang, Tianqi and Zhu, Mingwei and Shen, Haozhan and Lee, Kyusong and Lu, Xiaopeng and Yin, Jianwei",
"booktitle": "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
"doi": "10.18653/v1/2022.emnlp-demos.4",
"editor": "Che, Wanxiang and Shutova, Ekaterina",
"month": "dec",
"pages": "30--37",
"publisher": "Association for Computational Linguistics",
"title": "An Explainable Toolbox for Evaluating Pre-trained Vision-Language Models",
"type": "inproceedings",
"url": "https://aclanthology.org/2022.emnlp-demos.4",
"year": "2022"
},
"zhu2023contrastive": {
"author": "Zhu, Xiangru and Sun, Penglei and Wang, Chengyu and Liu, Jingping and Li, Zhixu and Xiao, Yanghua and Huang, Jun",
"journal": "arXiv preprint arXiv:2312.02338",
"title": "A Contrastive Compositional Benchmark for Text-to-Image Synthesis: A Study with Unified Text-to-Image Fidelity Metrics",
"type": "article",
"year": "2023"
}
}; |