File size: 127,059 Bytes
d736789
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
const generatedBibEntries = {
    "10.1007/978-3-319-46454-1_24": {
        "abstract": "There is considerable interest in the task of automatically generating image captions. However, evaluation is challenging. Existing automatic evaluation metrics are primarily sensitive to n-gram overlap, which is neither necessary nor sufficient for the task of simulating human judgment. We hypothesize that semantic propositional content is an important component of human caption evaluation, and propose a new automated caption evaluation metric defined over scene graphs coined SPICE. Extensive evaluations across a range of models and datasets indicate that SPICE captures human judgments over model-generated captions better than other automatic metrics (e.g., system-level correlation of 0.88 with human judgments on the MS COCO dataset, versus 0.43 for CIDEr and 0.53 for METEOR). Furthermore, SPICE can answer questions such as which caption-generator best understands colors? and can caption-generators count?",
        "address": "Cham",
        "author": "Anderson, Peter and Fernando, Basura and Johnson, Mark and Gould, Stephen",
        "booktitle": "Computer Vision -- ECCV 2016",
        "editor": "Leibe, Bastian and Matas, Jiri and Sebe, Nicu and Welling, Max",
        "isbn": "978-3-319-46454-1",
        "pages": "382--398",
        "publisher": "Springer International Publishing",
        "title": "SPICE: Semantic Propositional Image Caption Evaluation",
        "type": "InProceedings",
        "year": "2016"
    },
    "10.1016/j.neunet.2021.07.019": {
        "address": "GBR",
        "author": "Frolov, Stanislav and Hinz, Tobias and Raue, Federico and Hees, J\\\"{o}rn and Dengel, Andreas",
        "doi": "10.1016/j.neunet.2021.07.019",
        "issn": "0893-6080",
        "issue_date": "Dec 2021",
        "journal": "Neural Netw.",
        "keywords": "Generative adversarial networks, Text-to-image synthesis",
        "month": "dec",
        "number": "C",
        "numpages": "23",
        "pages": "187\u2013209",
        "publisher": "Elsevier Science Ltd.",
        "title": "Adversarial text-to-image synthesis: A review",
        "type": "article",
        "url": "https://doi.org/10.1016/j.neunet.2021.07.019",
        "volume": "144",
        "year": "2021"
    },
    "10.1145/1809028.1806638": {
        "abstract": "MapReduce and similar systems significantly ease the task of writing data-parallel code. However, many real-world computations require a pipeline of MapReduces, and programming and managing such pipelines can be difficult. We present FlumeJava, a Java library that makes it easy to develop, test, and run efficient data-parallel pipelines. At the core of the FlumeJava library are a couple of classes that represent immutable parallel collections, each supporting a modest number of operations for processing them in parallel. Parallel collections and their operations present a simple, high-level, uniform abstraction over different data representations and execution strategies. To enable parallel operations to run efficiently, FlumeJava defers their evaluation, instead internally constructing an execution plan dataflow graph. When the final results of the parallel operations are eventually needed, FlumeJava first optimizes the execution plan, and then executes the optimized operations on appropriate underlying primitives (e.g., MapReduces). The combination of high-level abstractions for parallel data and computation, deferred evaluation and optimization, and efficient parallel primitives yields an easy-to-use system that approaches the efficiency of hand-optimized pipelines. FlumeJava is in active use by hundreds of pipeline developers within Google.",
        "address": "New York, NY, USA",
        "author": "Chambers, Craig and Raniwala, Ashish and Perry, Frances and Adams, Stephen and Henry, Robert R. and Bradshaw, Robert and Weizenbaum, Nathan",
        "doi": "10.1145/1809028.1806638",
        "issn": "0362-1340",
        "issue_date": "June 2010",
        "journal": "SIGPLAN Not.",
        "keywords": "data-parallel programming, java, mapreduce",
        "month": "jun",
        "number": "6",
        "numpages": "13",
        "pages": "363\u2013375",
        "publisher": "Association for Computing Machinery",
        "title": "FlumeJava: easy, efficient data-parallel pipelines",
        "type": "article",
        "url": "https://doi.org/10.1145/1809028.1806638",
        "volume": "45",
        "year": "2010"
    },
    "10.1145/3461353.3461388": {
        "abstract": "In recent years, deep learning technology has made breakthroughs in computer vision. After using large-scale data training, the deep neural network represented by GAN is significantly better than previous technologies in image generation, including generating more reasonable, higher-definition, more complex, and more accurate images. With the continuous development of datasets, models, and applications, the fusion of different modal information, including fusion of natural language, semantic layouts, tags, edge maps, and other different modal information, to generate images has become a new demand and challenge. There are related reviews on image generation and multimodal deep learning. However, there has not been a review dedicated to multimodal deep-learning image generation to discuss the current status, existing problems, and challenges of this task. Therefore, this review proposes a survey on multimodal deep learning image generation. It aims to provide readers with an application scenario for multimodal deep learning image generation. Also, it provides readers with new multimodal deep learning image generation technologies, the relevant datasets, evaluation metrics used, and some results comparison. Finally, this article describes some of the challenges and future topics of multimodal deep learning image generation.",
        "address": "New York, NY, USA",
        "author": "Luo, Sanbi",
        "booktitle": "Proceedings of the 2021 5th International Conference on Innovation in Artificial Intelligence",
        "doi": "10.1145/3461353.3461388",
        "isbn": "9781450388634",
        "keywords": "multimodal, machine learning, image synthesis, deep learning, computer vision",
        "location": "Xia men, China",
        "numpages": "13",
        "pages": "108\u2013120",
        "publisher": "Association for Computing Machinery",
        "series": "ICIAI '21",
        "title": "A Survey on Multimodal Deep Learning for Image Synthesis: Applications, methods, datasets, evaluation metrics, and results comparison",
        "type": "inproceedings",
        "url": "https://doi.org/10.1145/3461353.3461388",
        "year": "2021"
    },
    "10.3115/1073083.1073135": {
        "abstract": "Human evaluations of machine translation are extensive but expensive. Human evaluations can take months to finish and involve human labor that can not be reused. We propose a method of automatic machine translation evaluation that is quick, inexpensive, and language-independent, that correlates highly with human evaluation, and that has little marginal cost per run. We present this method as an automated understudy to skilled human judges which substitutes for them when there is need for quick or frequent evaluations.",
        "address": "USA",
        "author": "Papineni, Kishore and Roukos, Salim and Ward, Todd and Zhu, Wei-Jing",
        "booktitle": "Proceedings of the 40th Annual Meeting on Association for Computational Linguistics",
        "doi": "10.3115/1073083.1073135",
        "location": "Philadelphia, Pennsylvania",
        "numpages": "8",
        "pages": "311\u2013318",
        "publisher": "Association for Computational Linguistics",
        "series": "ACL '02",
        "title": "BLEU: a method for automatic evaluation of machine translation",
        "type": "inproceedings",
        "url": "https://doi.org/10.3115/1073083.1073135",
        "year": "2002"
    },
    "10.5555/3295222.3295408": {
        "address": "Red Hook, NY, USA",
        "author": "Heusel, Martin and Ramsauer, Hubert and Unterthiner, Thomas and Nessler, Bernhard and Hochreiter, Sepp",
        "isbn": "9781510860964",
        "location": "Long Beach, California, USA",
        "numpages": "12",
        "pages": "6629\u20136640",
        "publisher": "Curran Associates Inc.",
        "series": "NIPS'17",
        "title": "GANs trained by a two time-scale update rule converge to a local nash equilibrium",
        "type": "inproceedings",
        "year": "2017"
    },
    "10081412": {
        "author": "Croitoru, Florinel-Alin and Hondru, Vlad and Ionescu, Radu Tudor and Shah, Mubarak",
        "doi": "10.1109/TPAMI.2023.3261988",
        "journal": "IEEE Transactions on Pattern Analysis and Machine Intelligence",
        "keywords": "Computational modeling;Mathematical models;Noise reduction;Data models;Computer vision;Training;Task analysis;Denoising diffusion models;deep generative modeling;diffusion models;image generation;noise conditioned score networks;score-based models",
        "number": "9",
        "pages": "10850-10869",
        "title": "Diffusion Models in Vision: A Survey",
        "type": "ARTICLE",
        "volume": "45",
        "year": "2023"
    },
    "10123038": {
        "author": "Xu, Peng and Zhu, Xiatian and Clifton, David A.",
        "doi": "10.1109/TPAMI.2023.3275156",
        "journal": "IEEE Transactions on Pattern Analysis and Machine Intelligence",
        "keywords": "Transformers;Task analysis;Surveys;Visualization;Taxonomy;Mathematical models;Data models;Multimodal learning;transformer;introductory;taxonomy;deep learning;machine learning",
        "number": "10",
        "pages": "12113-12132",
        "title": "Multimodal Learning With Transformers: A Survey",
        "type": "ARTICLE",
        "volume": "45",
        "year": "2023"
    },
    "10218041": {
        "author": "Abdulghanni, Sara Faez and Abdulmunem, Ashwan A.",
        "booktitle": "2023 Al-Sadiq International Conference on Communication and Information Technology (AICCIT)",
        "doi": "10.1109/AICCIT57614.2023.10218041",
        "keywords": "Measurement;Training;Surveys;Deep learning;Image synthesis;Transforms;Stability analysis;Image generation;Diffusion model;Stable diffusion;Generative models;Deep learning",
        "number": "",
        "pages": "171-175",
        "title": "Image Generation Conditioned on Text Using Deep Learning Models: Survey",
        "type": "INPROCEEDINGS",
        "volume": "",
        "year": "2023"
    },
    "4176674": {
        "author": "Dolan, Brian",
        "booktitle": "2006 Fortieth Asilomar Conference on Signals, Systems and Computers",
        "doi": "10.1109/ACSSC.2006.354864",
        "keywords": "Mammography;Computer vision;Design automation;Humans;Medical diagnostic imaging;Breast cancer;Protocols;Biopsy;Gold;Visualization",
        "number": "",
        "pages": "821-825",
        "title": "Computer Aided Diagnosis in Mammography: Its Development and Early Challenges",
        "type": "INPROCEEDINGS",
        "volume": "",
        "year": "2006"
    },
    "5412098": {
        "author": "Thung, Kim-Han and Raveendran, Paramesran",
        "booktitle": "2009 International Conference for Technical Postgraduates (TECHPOS)",
        "doi": "10.1109/TECHPOS.2009.5412098",
        "keywords": "Image quality;Digital images;Signal processing;Image coding;Image storage;Signal processing algorithms;Noise reduction;Video compression;PSNR;Gaussian noise",
        "number": "",
        "pages": "1-4",
        "title": "A survey of image quality measures",
        "type": "INPROCEEDINGS",
        "volume": "",
        "year": "2009"
    },
    "5430991": {
        "author": "Saad, Michele A. and Bovik, Alan C. and Charrier, Christophe",
        "doi": "10.1109/LSP.2010.2045550",
        "journal": "IEEE Signal Processing Letters",
        "keywords": "Discrete cosine transforms;Image quality;Statistics;Feature extraction;Layout;Distortion measurement;Support vector machines;Testing;Machine learning algorithms;Machine learning;Anisotropy;discrete cosine transform;kurtosis;natural scene statistics;no-reference quality assessment",
        "number": "6",
        "pages": "583-586",
        "title": "A DCT Statistics-Based Blind Image Quality Index",
        "type": "ARTICLE",
        "volume": "17",
        "year": "2010"
    },
    "5596999": {
        "author": "Hor\u00e9, Alain and Ziou, Djemel",
        "booktitle": "2010 20th International Conference on Pattern Recognition",
        "doi": "10.1109/ICPR.2010.579",
        "keywords": "PSNR;Degradation;Image quality;Additives;Transform coding;Sensitivity;Image coding;PSNR;SSIM;image quality metrics",
        "number": "",
        "pages": "2366-2369",
        "title": "Image Quality Metrics: PSNR vs. SSIM",
        "type": "INPROCEEDINGS",
        "volume": "",
        "year": "2010"
    },
    "5995446": {
        "author": "Tang, Huixuan and Joshi, Neel and Kapoor, Ashish",
        "booktitle": "CVPR 2011",
        "doi": "10.1109/CVPR.2011.5995446",
        "keywords": "Image quality;Distortion measurement;Transform coding;Histograms;Noise;Kernel;Degradation",
        "number": "",
        "pages": "305-312",
        "title": "Learning a blind measure of perceptual image quality",
        "type": "INPROCEEDINGS",
        "volume": "",
        "year": "2011"
    },
    "6165361": {
        "author": "Ye, Peng and Doermann, David",
        "doi": "10.1109/TIP.2012.2190086",
        "journal": "IEEE Transactions on Image Processing",
        "keywords": "Feature extraction;Training;Image quality;Visualization;Transform coding;Databases;Image coding;Gabor filter;no-reference image quality assessment (NRIQA);texture analysis;visual codebook",
        "number": "7",
        "pages": "3129-3138",
        "title": "No-Reference Image Quality Assessment Using Visual Codebooks",
        "type": "ARTICLE",
        "volume": "21",
        "year": "2012"
    },
    "6172573": {
        "author": "Saad, Michele A. and Bovik, Alan C. and Charrier, Christophe",
        "doi": "10.1109/TIP.2012.2191563",
        "journal": "IEEE Transactions on Image Processing",
        "keywords": "Discrete cosine transforms;Feature extraction;Visualization;Humans;Computational modeling;Predictive models;Image quality;Discrete cosine transform (DCT);generalized Gaussian density;natural scene statistics;no-reference image quality assessment",
        "number": "8",
        "pages": "3339-3352",
        "title": "Blind Image Quality Assessment: A Natural Scene Statistics Approach in the DCT Domain",
        "type": "ARTICLE",
        "volume": "21",
        "year": "2012"
    },
    "6190099": {
        "author": "Mittal, Anish and Moorthy, Anush K. and Bovik, Alan C.",
        "booktitle": "2011 Conference Record of the Forty Fifth Asilomar Conference on Signals, Systems and Computers (ASILOMAR)",
        "doi": "10.1109/ACSSC.2011.6190099",
        "keywords": "Humans;Image quality;Transform coding;Correlation;Databases;Measurement;Computational modeling",
        "number": "",
        "pages": "723-727",
        "title": "Blind/Referenceless Image Spatial Quality Evaluator",
        "type": "INPROCEEDINGS",
        "volume": "",
        "year": "2011"
    },
    "6353522": {
        "author": "Mittal, Anish and Soundararajan, Rajiv and Bovik, Alan C.",
        "doi": "10.1109/LSP.2012.2227726",
        "journal": "IEEE Signal Processing Letters",
        "keywords": "Image quality;Image processing;Statistical analysis;Feature extraction;Completely blind;distortion free;image quality assessment;no reference",
        "number": "3",
        "pages": "209-212",
        "title": "Making a \u201cCompletely Blind\u201d Image Quality Analyzer",
        "type": "ARTICLE",
        "volume": "20",
        "year": "2013"
    },
    "7084843": {
        "author": "Venkatanath N and Praneeth D and Maruthi Chandrasekhar Bh and Channappayya, Sumohana S. and Medasani, Swarup S.",
        "booktitle": "2015 Twenty First National Conference on Communications (NCC)",
        "doi": "10.1109/NCC.2015.7084843",
        "keywords": "Image quality;Noise;Databases;Feature extraction;Standards;Transform coding;Image segmentation;No reference image quality assessment;spatial activity;Perceptual quality",
        "number": "",
        "pages": "1-6",
        "title": "Blind image quality evaluation using perception based features",
        "type": "INPROCEEDINGS",
        "volume": "",
        "year": "2015"
    },
    "8195348": {
        "author": "Wu, Xian and Xu, Kun and Hall, Peter",
        "doi": "10.23919/TST.2017.8195348",
        "journal": "Tsinghua Science and Technology",
        "keywords": "Gallium nitride;Image generation;Generators;Image resolution;Feature extraction;Training;Linear programming;image synthesis;image editing;constrained image synthesis;generative adversarial networks;image-to-image translation",
        "number": "6",
        "pages": "660-674",
        "title": "A survey of image synthesis and editing with generative adversarial networks",
        "type": "ARTICLE",
        "volume": "22",
        "year": "2017"
    },
    "8578241": {
        "address": "Los Alamitos, CA, USA",
        "author": "T. Xu and P. Zhang and Q. Huang and H. Zhang and Z. Gan and X. Huang and X. He",
        "booktitle": "2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
        "doi": "10.1109/CVPR.2018.00143",
        "issn": "",
        "keywords": "gallium nitride;generative adversarial networks;computational modeling;image generation;generators;semantics;visualization",
        "month": "jun",
        "pages": "1316-1324",
        "publisher": "IEEE Computer Society",
        "title": "AttnGAN: Fine-Grained Text to Image Generation with Attentional Generative Adversarial Networks",
        "type": "INPROCEEDINGS",
        "url": "https://doi.ieeecomputersociety.org/10.1109/CVPR.2018.00143",
        "volume": "",
        "year": "2018"
    },
    "9495208": {
        "author": "Xia, Jiazhi and Lin, Weixing and Jiang, Guang and Wang, Yunhai and Chen, Wei and Schreck, Tobias",
        "doi": "10.1109/MCG.2021.3098804",
        "journal": "IEEE Computer Graphics and Applications",
        "keywords": "Visualization;Shape analysis;Visual perception;Clustering algorithms;Deep learning;Splines (mathematics)",
        "number": "5",
        "pages": "79-89",
        "title": "Visual Clustering Factors in Scatterplots",
        "type": "ARTICLE",
        "volume": "41",
        "year": "2021"
    },
    "AUTOMATIC1111_Stable_Diffusion_Web_2022": {
        "author": "AUTOMATIC1111",
        "month": "aug",
        "title": "Stable Diffusion Web UI",
        "type": "software",
        "url": "https://github.com/AUTOMATIC1111/stable-diffusion-webui",
        "year": "2022"
    },
    "Agrawal_2019_ICCV": {
        "author": "Agrawal, Harsh and Desai, Karan and Wang, Yufei and Chen, Xinlei and Jain, Rishabh and Johnson, Mark and Batra, Dhruv and Parikh, Devi and Lee, Stefan and Anderson, Peter",
        "booktitle": "Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)",
        "month": "October",
        "title": "nocaps: novel object captioning at scale",
        "type": "InProceedings",
        "year": "2019"
    },
    "Antol_2015_ICCV": {
        "author": "Antol, Stanislaw and Agrawal, Aishwarya and Lu, Jiasen and Mitchell, Margaret and Batra, Dhruv and Zitnick, C. Lawrence and Parikh, Devi",
        "booktitle": "Proceedings of the IEEE International Conference on Computer Vision (ICCV)",
        "month": "December",
        "title": "VQA: Visual Question Answering",
        "type": "InProceedings",
        "year": "2015"
    },
    "BahdanauCB14": {
        "author": "Dzmitry Bahdanau and Kyunghyun Cho and Yoshua Bengio",
        "bibsource": "dblp computer science bibliography, https://dblp.org",
        "biburl": "https://dblp.org/rec/journals/corr/BahdanauCB14.bib",
        "booktitle": "3rd International Conference on Learning Representations, {ICLR} 2015, San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings",
        "editor": "Yoshua Bengio and Yann LeCun",
        "timestamp": "Wed, 17 Jul 2019 10:40:54 +0200",
        "title": "Neural Machine Translation by Jointly Learning to Align and Translate",
        "type": "inproceedings",
        "url": "http://arxiv.org/abs/1409.0473",
        "year": "2015"
    },
    "Chang2023": {
        "author": "Nadine Chang",
        "doi": "10.1184/R1/23396759.v1",
        "month": "6",
        "title": "{Bridging the Gap Between Human Vision and Computer Vision}",
        "type": "article",
        "url": "https://kilthub.cmu.edu/articles/thesis/Bridging_the_Gap_Between_Human_Vision_and_Computer_Vision/23396759",
        "year": "2023"
    },
    "Changpinyo_2021_CVPR": {
        "author": "Changpinyo, Soravit and Sharma, Piyush and Ding, Nan and Soricut, Radu",
        "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
        "month": "June",
        "pages": "3558-3568",
        "title": "Conceptual 12M: Pushing Web-Scale Image-Text Pre-Training To Recognize Long-Tail Visual Concepts",
        "type": "InProceedings",
        "year": "2021"
    },
    "Chen_2018_CVPR": {
        "author": "Chen, Jingwen and Chen, Jiawei and Chao, Hongyang and Yang, Ming",
        "booktitle": "Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)",
        "month": "June",
        "title": "Image Blind Denoising With Generative Adversarial Network Based Noise Modeling",
        "type": "InProceedings",
        "year": "2018"
    },
    "Chen_2023_ICCV": {
        "author": "Chen, Rui and Chen, Yongwei and Jiao, Ningxin and Jia, Kui",
        "booktitle": "Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)",
        "month": "October",
        "pages": "22246-22256",
        "title": "Fantasia3D: Disentangling Geometry and Appearance for High-quality Text-to-3D Content Creation",
        "type": "InProceedings",
        "year": "2023"
    },
    "Chen_2024_WACV": {
        "author": "Chen, Minghao and Laina, Iro and Vedaldi, Andrea",
        "booktitle": "Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)",
        "month": "January",
        "pages": "5343-5353",
        "title": "Training-Free Layout Control With Cross-Attention Guidance",
        "type": "InProceedings",
        "year": "2024"
    },
    "Cho_2023_ICCV": {
        "author": "Cho, Jaemin and Zala, Abhay and Bansal, Mohit",
        "booktitle": "Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)",
        "month": "October",
        "pages": "3043-3054",
        "title": "DALL-Eval: Probing the Reasoning Skills and Social Biases of Text-to-Image Generation Models",
        "type": "InProceedings",
        "year": "2023"
    },
    "Cui_2018_CVPR": {
        "author": "Cui, Yin and Yang, Guandao and Veit, Andreas and Huang, Xun and Belongie, Serge",
        "booktitle": "Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)",
        "month": "June",
        "title": "Learning to Evaluate Image Captioning",
        "type": "InProceedings",
        "year": "2018"
    },
    "Desai_2021_CVPR": {
        "author": "Desai, Karan and Johnson, Justin",
        "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
        "month": "June",
        "pages": "11162-11173",
        "title": "VirTex: Learning Visual Representations From Textual Annotations",
        "type": "InProceedings",
        "year": "2021"
    },
    "Gokhale2022BenchmarkingSR": {
        "author": "Tejas Gokhale and Hamid Palangi and Besmira Nushi and Vibhav Vineet and Eric Horvitz and Ece Kamar and Chitta Baral and Yezhou Yang",
        "journal": "ArXiv",
        "title": "Benchmarking Spatial Relationships in Text-to-Image Generation",
        "type": "article",
        "url": "https://api.semanticscholar.org/CorpusID:254877055",
        "volume": "abs/2212.10015",
        "year": "2022"
    },
    "Goyal_2017_CVPR": {
        "author": "Goyal, Yash and Khot, Tejas and Summers-Stay, Douglas and Batra, Dhruv and Parikh, Devi",
        "booktitle": "Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)",
        "month": "July",
        "title": "Making the v in VQA Matter: Elevating the Role of Image Understanding in Visual Question Answering",
        "type": "InProceedings",
        "year": "2017"
    },
    "Grimal_2024_tiam": {
        "author": "Grimal, Paul and Le Borgne, Herv\\'e and Ferret, Olivier and Tourille, Julien",
        "booktitle": "Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)",
        "month": "January",
        "pages": "2890-2899",
        "title": "TIAM - A Metric for Evaluating Alignment in Text-to-Image Generation",
        "type": "InProceedings",
        "year": "2024"
    },
    "Hu_2021_ICCV": {
        "author": "Hu, Ronghang and Singh, Amanpreet",
        "booktitle": "Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)",
        "month": "October",
        "pages": "1439-1449",
        "title": "UniT: Multimodal Multitask Learning With a Unified Transformer",
        "type": "InProceedings",
        "year": "2021"
    },
    "Hu_2023_ICCV": {
        "author": "Hu, Yushi and Liu, Benlin and Kasai, Jungo and Wang, Yizhong and Ostendorf, Mari and Krishna, Ranjay and Smith, Noah A.",
        "booktitle": "Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)",
        "month": "October",
        "pages": "20406-20417",
        "title": "TIFA: Accurate and Interpretable Text-to-Image Faithfulness Evaluation with Question Answering",
        "type": "InProceedings",
        "year": "2023"
    },
    "Hudson_2019_CVPR": {
        "author": "Hudson, Drew A. and Manning, Christopher D.",
        "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
        "month": "June",
        "title": "GQA: A New Dataset for Real-World Visual Reasoning and Compositional Question Answering",
        "type": "InProceedings",
        "year": "2019"
    },
    "Karras_2019_CVPR": {
        "author": "Karras, Tero and Laine, Samuli and Aila, Timo",
        "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
        "month": "June",
        "title": "A Style-Based Generator Architecture for Generative Adversarial Networks",
        "type": "InProceedings",
        "year": "2019"
    },
    "Karras_2020_CVPR": {
        "author": "Karras, Tero and Laine, Samuli and Aittala, Miika and Hellsten, Janne and Lehtinen, Jaakko and Aila, Timo",
        "booktitle": "IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
        "month": "June",
        "title": "Analyzing and Improving the Image Quality of StyleGAN",
        "type": "InProceedings",
        "year": "2020"
    },
    "Kirstain2023PickaPicAO": {
        "author": "Yuval Kirstain and Adam Polyak and Uriel Singer and Shahbuland Matiana and Joe Penna and Omer Levy",
        "journal": "ArXiv",
        "title": "Pick-a-Pic: An Open Dataset of User Preferences for Text-to-Image Generation",
        "type": "article",
        "url": "https://api.semanticscholar.org/CorpusID:258437096",
        "volume": "abs/2305.01569",
        "year": "2023"
    },
    "Krishna2016VisualGC": {
        "author": "Ranjay Krishna and Yuke Zhu and Oliver Groth and Justin Johnson and Kenji Hata and Joshua Kravitz and Stephanie Chen and Yannis Kalantidis and Li-Jia Li and David A. Shamma and Michael S. Bernstein and Li Fei-Fei",
        "journal": "International Journal of Computer Vision",
        "pages": "32 - 73",
        "title": "Visual Genome: Connecting Language and Vision Using Crowdsourced Dense Image Annotations",
        "type": "article",
        "url": "https://api.semanticscholar.org/CorpusID:4492210",
        "volume": "123",
        "year": "2016"
    },
    "Li2023BLIP2BL": {
        "author": "Junnan Li and Dongxu Li and Silvio Savarese and Steven C. H. Hoi",
        "booktitle": "International Conference on Machine Learning",
        "title": "BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models",
        "type": "inproceedings",
        "url": "https://api.semanticscholar.org/CorpusID:256390509",
        "year": "2023"
    },
    "Li_2019_ICCV": {
        "author": "Li, Kunpeng and Zhang, Yulun and Li, Kai and Li, Yuanyuan and Fu, Yun",
        "booktitle": "Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)",
        "month": "October",
        "title": "Visual Semantic Reasoning for Image-Text Matching",
        "type": "InProceedings",
        "year": "2019"
    },
    "Li_2022_CVPR": {
        "author": "Li, Zhiheng and Min, Martin Renqiang and Li, Kai and Xu, Chenliang",
        "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
        "month": "June",
        "pages": "18197-18207",
        "title": "StyleT2I: Toward Compositional and High-Fidelity Text-to-Image Synthesis",
        "type": "InProceedings",
        "year": "2022"
    },
    "Lin_2023_CVPR": {
        "author": "Lin, Chen-Hsuan and Gao, Jun and Tang, Luming and Takikawa, Towaki and Zeng, Xiaohui and Huang, Xun and Kreis, Karsten and Fidler, Sanja and Liu, Ming-Yu and Lin, Tsung-Yi",
        "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
        "month": "June",
        "pages": "300-309",
        "title": "Magic3D: High-Resolution Text-to-3D Content Creation",
        "type": "InProceedings",
        "year": "2023"
    },
    "Liu2023VisualIT": {
        "author": "Haotian Liu and Chunyuan Li and Qingyang Wu and Yong Jae Lee",
        "journal": "ArXiv",
        "title": "Visual Instruction Tuning",
        "type": "article",
        "url": "https://api.semanticscholar.org/CorpusID:258179774",
        "volume": "abs/2304.08485",
        "year": "2023"
    },
    "Ma_2023_CVPR": {
        "author": "Ma, Zixian and Hong, Jerry and Gul, Mustafa Omer and Gandhi, Mona and Gao, Irena and Krishna, Ranjay",
        "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
        "month": "June",
        "pages": "10910-10921",
        "title": "CREPE: Can Vision-Language Foundation Models Reason Compositionally?",
        "type": "InProceedings",
        "year": "2023"
    },
    "Metzer_2023_CVPR": {
        "author": "Metzer, Gal and Richardson, Elad and Patashnik, Or and Giryes, Raja and Cohen-Or, Daniel",
        "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
        "month": "June",
        "pages": "12663-12673",
        "title": "Latent-NeRF for Shape-Guided Generation of 3D Shapes and Textures",
        "type": "InProceedings",
        "year": "2023"
    },
    "NEURIPS2019_65699726": {
        "author": "Zhou, Sharon and Gordon, Mitchell and Krishna, Ranjay and Narcomey, Austin and Fei-Fei, Li F and Bernstein, Michael",
        "booktitle": "Advances in Neural Information Processing Systems",
        "editor": "H. Wallach and H. Larochelle and A. Beygelzimer and F. d\\textquotesingle Alch\\'{e}-Buc and E. Fox and R. Garnett",
        "pages": "",
        "publisher": "Curran Associates, Inc.",
        "title": "HYPE: A Benchmark for Human eYe Perceptual Evaluation of Generative Models",
        "type": "inproceedings",
        "url": "https://proceedings.neurips.cc/paper_files/paper/2019/file/65699726a3c601b9f31bf04019c8593c-Paper.pdf",
        "volume": "32",
        "year": "2019"
    },
    "NEURIPS2019_c74d97b0": {
        "author": "Lu, Jiasen and Batra, Dhruv and Parikh, Devi and Lee, Stefan",
        "booktitle": "Advances in Neural Information Processing Systems",
        "editor": "H. Wallach and H. Larochelle and A. Beygelzimer and F. d\\textquotesingle Alch\\'{e}-Buc and E. Fox and R. Garnett",
        "pages": "",
        "publisher": "Curran Associates, Inc.",
        "title": "ViLBERT: Pretraining Task-Agnostic Visiolinguistic Representations for Vision-and-Language Tasks",
        "type": "inproceedings",
        "url": "https://proceedings.neurips.cc/paper_files/paper/2019/file/c74d97b01eae257e44aa9d5bade97baf-Paper.pdf",
        "volume": "32",
        "year": "2019"
    },
    "NIPS2011_5dd9db5e": {
        "author": "Ordonez, Vicente and Kulkarni, Girish and Berg, Tamara",
        "booktitle": "Advances in Neural Information Processing Systems",
        "editor": "J. Shawe-Taylor and R. Zemel and P. Bartlett and F. Pereira and K.Q. Weinberger",
        "pages": "",
        "publisher": "Curran Associates, Inc.",
        "title": "Im2Text: Describing Images Using 1 Million Captioned Photographs",
        "type": "inproceedings",
        "url": "https://proceedings.neurips.cc/paper_files/paper/2011/file/5dd9db5e033da9c6fb5ba83c7a7ebea9-Paper.pdf",
        "volume": "24",
        "year": "2011"
    },
    "NIPS2016_8a3363ab": {
        "author": "Salimans, Tim and Goodfellow, Ian and Zaremba, Wojciech and Cheung, Vicki and Radford, Alec and Chen, Xi and Chen, Xi",
        "booktitle": "Advances in Neural Information Processing Systems",
        "editor": "D. Lee and M. Sugiyama and U. Luxburg and I. Guyon and R. Garnett",
        "pages": "",
        "publisher": "Curran Associates, Inc.",
        "title": "Improved Techniques for Training GANs",
        "type": "inproceedings",
        "url": "https://proceedings.neurips.cc/paper_files/paper/2016/file/8a3363abe792db2d8761d6403605aeb7-Paper.pdf",
        "volume": "29",
        "year": "2016"
    },
    "Otani_2023_CVPR": {
        "author": "Otani, Mayu and Togashi, Riku and Sawai, Yu and Ishigami, Ryosuke and Nakashima, Yuta and Rahtu, Esa and Heikkil\\\"a, Janne and Satoh, Shin{\\textquoteright}ichi",
        "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
        "month": "June",
        "pages": "14277-14286",
        "title": "Toward Verifiable and Reproducible Human Evaluation for Text-to-Image Generation",
        "type": "InProceedings",
        "year": "2023"
    },
    "Rombach_2022_CVPR": {
        "author": "Rombach, Robin and Blattmann, Andreas and Lorenz, Dominik and Esser, Patrick and Ommer, Bj\\\"orn",
        "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
        "month": "June",
        "pages": "10684-10695",
        "title": "High-Resolution Image Synthesis With Latent Diffusion Models",
        "type": "InProceedings",
        "year": "2022"
    },
    "Ruiz_2023_CVPR": {
        "author": "Ruiz, Nataniel and Li, Yuanzhen and Jampani, Varun and Pritch, Yael and Rubinstein, Michael and Aberman, Kfir",
        "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
        "month": "June",
        "pages": "22500-22510",
        "title": "DreamBooth: Fine Tuning Text-to-Image Diffusion Models for Subject-Driven Generation",
        "type": "InProceedings",
        "year": "2023"
    },
    "Sahin_2024_WACV": {
        "author": "Sahin, Ugur and Li, Hang and Khan, Qadeer and Cremers, Daniel and Tresp, Volker",
        "booktitle": "Proceedings of the IEEE/CVF Winter Conference on Applications of Computer Vision (WACV)",
        "month": "January",
        "pages": "5563-5573",
        "title": "Enhancing Multimodal Compositional Reasoning of Visual Language Models With Generative Negative Mining",
        "type": "InProceedings",
        "year": "2024"
    },
    "Salin_2023_ICCV": {
        "author": "Salin, Emmanuelle and Ayache, St\\'ephane and Favre, Benoit",
        "booktitle": "Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops",
        "month": "October",
        "pages": "339-352",
        "title": "Towards an Exhaustive Evaluation of Vision-Language Foundation Models",
        "type": "InProceedings",
        "year": "2023"
    },
    "Schramowski_2023_CVPR": {
        "author": "Schramowski, Patrick and Brack, Manuel and Deiseroth, Bj\\\"orn and Kersting, Kristian",
        "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
        "month": "June",
        "pages": "22522-22531",
        "title": "Safe Latent Diffusion: Mitigating Inappropriate Degeneration in Diffusion Models",
        "type": "InProceedings",
        "year": "2023"
    },
    "Simonyan15vgg": {
        "author": "Karen Simonyan and Andrew Zisserman",
        "booktitle": "International Conference on Learning Representations",
        "title": "Very Deep Convolutional Networks for Large-Scale Image Recognition",
        "type": "InProceedings",
        "year": "2015"
    },
    "Su2020VL-BERT": {
        "author": "Weijie Su and Xizhou Zhu and Yue Cao and Bin Li and Lewei Lu and Furu Wei and Jifeng Dai",
        "booktitle": "International Conference on Learning Representations",
        "title": "VL-BERT: Pre-training of Generic Visual-Linguistic Representations",
        "type": "inproceedings",
        "url": "https://openreview.net/forum?id=SygXPaEYvH",
        "year": "2020"
    },
    "Thrush_2022_CVPR": {
        "author": "Thrush, Tristan and Jiang, Ryan and Bartolo, Max and Singh, Amanpreet and Williams, Adina and Kiela, Douwe and Ross, Candace",
        "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
        "month": "June",
        "pages": "5238-5248",
        "title": "Winoground: Probing Vision and Language Models for Visio-Linguistic Compositionality",
        "type": "InProceedings",
        "year": "2022"
    },
    "Tian2022GenerativeAN": {
        "author": "Chunwei Tian and Xuanyu Zhang and Chun-Wei Lin and Wangmeng Zuo and Yanning Zhang",
        "journal": "ArXiv",
        "title": "Generative Adversarial Networks for Image Super-Resolution: A Survey",
        "type": "article",
        "url": "https://api.semanticscholar.org/CorpusID:248426817",
        "volume": "abs/2204.13620",
        "year": "2022"
    },
    "Tran_2020_ACCV": {
        "author": "Tran, Linh Duy and Nguyen, Son Minh and Arai, Masayuki",
        "booktitle": "Proceedings of the Asian Conference on Computer Vision (ACCV)",
        "month": "November",
        "title": "GAN-based Noise Model for Denoising Real Images",
        "type": "InProceedings",
        "year": "2020"
    },
    "Vedantam_2015_CVPR": {
        "author": "Vedantam, Ramakrishna and Lawrence Zitnick, C. and Parikh, Devi",
        "booktitle": "Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)",
        "month": "June",
        "title": "CIDEr: Consensus-Based Image Description Evaluation",
        "type": "InProceedings",
        "year": "2015"
    },
    "Wang_2023_CVPR": {
        "author": "Wang, Su and Saharia, Chitwan and Montgomery, Ceslee and Pont-Tuset, Jordi and Noy, Shai and Pellegrini, Stefano and Onoe, Yasumasa and Laszlo, Sarah and Fleet, David J. and Soricut, Radu and Baldridge, Jason and Norouzi, Mohammad and Anderson, Peter and Chan, William",
        "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
        "month": "June",
        "pages": "18359-18369",
        "title": "Imagen Editor and EditBench: Advancing and Evaluating Text-Guided Image Inpainting",
        "type": "InProceedings",
        "year": "2023"
    },
    "Wu2023HumanPS": {
        "author": "Xiaoshi Wu and Yiming Hao and Keqiang Sun and Yixiong Chen and Feng Zhu and Rui Zhao and Hongsheng Li",
        "journal": "ArXiv",
        "title": "Human Preference Score v2: A Solid Benchmark for Evaluating Human Preferences of Text-to-Image Synthesis",
        "type": "article",
        "url": "https://api.semanticscholar.org/CorpusID:259171771",
        "volume": "abs/2306.09341",
        "year": "2023"
    },
    "Wu_2023_ICCV": {
        "author": "Wu, Xiaoshi and Sun, Keqiang and Zhu, Feng and Zhao, Rui and Li, Hongsheng",
        "booktitle": "Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)",
        "month": "October",
        "pages": "2096-2105",
        "title": "Human Preference Score: Better Aligning Text-to-Image Models with Human Preference",
        "type": "InProceedings",
        "year": "2023"
    },
    "Yarom2023WhatYS": {
        "author": "Michal Yarom and Yonatan Bitton and Soravit Changpinyo and Roee Aharoni and Jonathan Herzig and Oran Lang and Eran. O. Ofek and Idan Szpektor",
        "journal": "ArXiv",
        "title": "What You See is What You Read? Improving Text-Image Alignment Evaluation",
        "type": "article",
        "url": "https://api.semanticscholar.org/CorpusID:258740893",
        "volume": "abs/2305.10400",
        "year": "2023"
    },
    "Zellers_2019_CVPR": {
        "author": "Zellers, Rowan and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin",
        "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
        "month": "June",
        "title": "From Recognition to Cognition: Visual Commonsense Reasoning",
        "type": "InProceedings",
        "year": "2019"
    },
    "Zeng2024IntentTunerAI": {
        "author": "Xingchen Zeng and Ziyao Gao and Yilin Ye and Wei Zeng",
        "journal": "ArXiv",
        "title": "IntentTuner: An Interactive Framework for Integrating Human Intents in Fine-tuning Text-to-Image Generative Models",
        "type": "article",
        "url": "https://api.semanticscholar.org/CorpusID:267312299",
        "volume": "abs/2401.15559",
        "year": "2024"
    },
    "Zhang2020BERTScore": {
        "author": "Tianyi Zhang* and Varsha Kishore* and Felix Wu* and Kilian Q. Weinberger and Yoav Artzi",
        "booktitle": "International Conference on Learning Representations",
        "title": "BERTScore: Evaluating Text Generation with BERT",
        "type": "inproceedings",
        "url": "https://openreview.net/forum?id=SkeHuCVFDr",
        "year": "2020"
    },
    "Zhang2023CompressA": {
        "author": "Lei Zhang and Fangxun Shu and Sucheng Ren and Bingchen Zhao and Hao Jiang and Cihang Xie",
        "journal": "ArXiv",
        "title": "Compress \\& Align: Curating Image-Text Data with Human Knowledge",
        "type": "article",
        "url": "https://api.semanticscholar.org/CorpusID:266174263",
        "volume": "abs/2312.06726",
        "year": "2023"
    },
    "Zhang_2021_CVPR": {
        "author": "Zhang, Pengchuan and Li, Xiujun and Hu, Xiaowei and Yang, Jianwei and Zhang, Lei and Wang, Lijuan and Choi, Yejin and Gao, Jianfeng",
        "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
        "month": "June",
        "pages": "5579-5588",
        "title": "VinVL: Revisiting Visual Representations in Vision-Language Models",
        "type": "InProceedings",
        "year": "2021"
    },
    "Zhou_2022_CVPR": {
        "author": "Zhou, Xingyi and Koltun, Vladlen and Kr\\\"ahenb\\\"uhl, Philipp",
        "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
        "month": "June",
        "pages": "7571-7580",
        "title": "Simple Multi-Dataset Detection",
        "type": "InProceedings",
        "year": "2022"
    },
    "Zhou_2023_CVPR": {
        "author": "Zhou, Yutong and Shimada, Nobutaka",
        "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops",
        "month": "June",
        "pages": "826-842",
        "title": "Vision + Language Applications: A Survey",
        "type": "InProceedings",
        "year": "2023"
    },
    "Zhu_2015_ICCV": {
        "author": "Zhu, Yukun and Kiros, Ryan and Zemel, Rich and Salakhutdinov, Ruslan and Urtasun, Raquel and Torralba, Antonio and Fidler, Sanja",
        "booktitle": "Proceedings of the IEEE International Conference on Computer Vision (ICCV)",
        "month": "December",
        "title": "Aligning Books and Movies: Towards Story-Like Visual Explanations by Watching Movies and Reading Books",
        "type": "InProceedings",
        "year": "2015"
    },
    "Zitnick_2013_CVPR": {
        "author": "Zitnick, C. L. and Parikh, Devi",
        "booktitle": "Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)",
        "month": "June",
        "title": "Bringing Semantics into Focus Using Visual Abstraction",
        "type": "InProceedings",
        "year": "2013"
    },
    "ahmad2022new": {
        "author": "Ahmad, Waqar and Ali, Hazrat and Shah, Zubair and Azmat, Shoaib",
        "journal": "Scientific Reports",
        "number": "1",
        "pages": "9533",
        "publisher": "Nature Publishing Group UK London",
        "title": "A new generative adversarial network for medical images super resolution",
        "type": "article",
        "volume": "12",
        "year": "2022"
    },
    "bai2021mifid": {
        "author": "Bai, Ching-Yuan and Lin, Hsuan-Tien and Raffel, Colin and Kan, Wendy Chi-wen",
        "booktitle": "Proceedings of the 27th ACM SIGKDD Conference on Knowledge Discovery \\& Data Mining",
        "pages": "2534--2542",
        "title": "On training sample memorization: Lessons from benchmarking generative modeling with a large-scale competition",
        "type": "inproceedings",
        "year": "2021"
    },
    "balanced_vqa_v2": {
        "author": "Yash Goyal and Tejas Khot and Douglas Summers{-}Stay and Dhruv Batra and Devi Parikh",
        "booktitle": "Conference on Computer Vision and Pattern Recognition (CVPR)",
        "title": "Making the {V} in {VQA} Matter: Elevating the Role of Image Understanding in {V}isual {Q}uestion {A}nswering",
        "type": "InProceedings",
        "year": "2017"
    },
    "banerjee-lavie-2005-meteor": {
        "address": "Ann Arbor, Michigan",
        "author": "Banerjee, Satanjeev  and Lavie, Alon",
        "booktitle": "Proceedings of the {ACL} Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and/or Summarization",
        "editor": "Goldstein, Jade  and Lavie, Alon  and Lin, Chin-Yew  and Voss, Clare",
        "month": "jun",
        "pages": "65--72",
        "publisher": "Association for Computational Linguistics",
        "title": "{METEOR}: An Automatic Metric for {MT} Evaluation with Improved Correlation with Human Judgments",
        "type": "inproceedings",
        "url": "https://aclanthology.org/W05-0909",
        "year": "2005"
    },
    "barratt2018note": {
        "author": "Barratt, Shane and Sharma, Rishi",
        "journal": "arXiv preprint arXiv:1801.01973",
        "title": "A note on the inception score",
        "type": "article",
        "year": "2018"
    },
    "baryshnikov2023hypernymy": {
        "author": "Baryshnikov, Anton and Ryabinin, Max",
        "journal": "arXiv preprint arXiv:2310.09247",
        "title": "Hypernymy Understanding Evaluation of Text-to-Image Models via WordNet Hierarchy",
        "type": "article",
        "year": "2023"
    },
    "betti2023let": {
        "author": "Betti, Federico and Staiano, Jacopo and Baraldi, Lorenzo and Baraldi, Lorenzo and Cucchiara, Rita and Sebe, Nicu",
        "booktitle": "Proceedings of the 31st ACM International Conference on Multimedia",
        "pages": "9306--9312",
        "title": "Let's ViCE! Mimicking Human Cognitive Behavior in Image Generation Evaluation",
        "type": "inproceedings",
        "year": "2023"
    },
    "binkowski2018kid": {
        "author": "Bi{\\'n}kowski, Miko{\\l}aj and Sutherland, Danica J and Arbel, Michael and Gretton, Arthur",
        "booktitle": "International Conference on Learning Representations",
        "title": "Demystifying MMD GANs",
        "type": "inproceedings",
        "year": "2018"
    },
    "borji2022pros": {
        "author": "Borji, Ali",
        "journal": "Computer Vision and Image Understanding",
        "pages": "103329",
        "publisher": "Elsevier",
        "title": "Pros and cons of GAN evaluation measures: New developments",
        "type": "article",
        "volume": "215",
        "year": "2022"
    },
    "brock2018large": {
        "author": "Brock, Andrew and Donahue, Jeff and Simonyan, Karen",
        "journal": "arXiv preprint arXiv:1809.11096",
        "title": "Large scale GAN training for high fidelity natural image synthesis",
        "type": "article",
        "year": "2018"
    },
    "brown2020language": {
        "author": "Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and others",
        "journal": "Advances in neural information processing systems",
        "pages": "1877--1901",
        "title": "Language models are few-shot learners",
        "type": "article",
        "volume": "33",
        "year": "2020"
    },
    "caron2021emerging": {
        "author": "Caron, Mathilde and Touvron, Hugo and Misra, Ishan and J{\\'e}gou, Herv{\\'e} and Mairal, Julien and Bojanowski, Piotr and Joulin, Armand",
        "booktitle": "Proceedings of the IEEE/CVF international conference on computer vision",
        "pages": "9650--9660",
        "title": "Emerging properties in self-supervised vision transformers",
        "type": "inproceedings",
        "year": "2021"
    },
    "castro2024clove": {
        "author": "Castro, Santiago and Ziai, Amir and Saluja, Avneesh and Yuan, Zhuoning and Mihalcea, Rada",
        "journal": "arXiv preprint arXiv:2402.15021",
        "title": "CLoVe: Encoding Compositional Language in Contrastive Vision-Language Models",
        "type": "article",
        "year": "2024"
    },
    "chambers2010flumejava": {
        "abstract": "MapReduce and similar systems significantly ease the task of writing data-parallel code. However, many real-world computations require a pipeline of MapReduces, and programming and managing such pipelines can be difficult. We present FlumeJava, a Java library that makes it easy to develop, test, and run efficient data-parallel pipelines. At the core of the FlumeJava library are a couple of classes that represent immutable parallel collections, each supporting a modest number of operations for processing them in parallel. Parallel collections and their operations present a simple, high-level, uniform abstraction over different data representations and execution strategies. To enable parallel operations to run efficiently, FlumeJava defers their evaluation, instead internally constructing an execution plan dataflow graph. When the final results of the parallel operations are eventually needed, FlumeJava first optimizes the execution plan, and then executes the optimized operations on appropriate underlying primitives (e.g., MapReduces). The combination of high-level abstractions for parallel data and computation, deferred evaluation and optimization, and efficient parallel primitives yields an easy-to-use system that approaches the efficiency of hand-optimized pipelines. FlumeJava is in active use by hundreds of pipeline developers within Google.",
        "address": "New York, NY, USA",
        "author": "Chambers, Craig and Raniwala, Ashish and Perry, Frances and Adams, Stephen and Henry, Robert R. and Bradshaw, Robert and Weizenbaum, Nathan",
        "booktitle": "Proceedings of the 31st ACM SIGPLAN Conference on Programming Language Design and Implementation",
        "doi": "10.1145/1806596.1806638",
        "isbn": "9781450300193",
        "keywords": "data-parallel programming, java, mapreduce",
        "location": "Toronto, Ontario, Canada",
        "numpages": "13",
        "pages": "363\u2013375",
        "publisher": "Association for Computing Machinery",
        "series": "PLDI '10",
        "title": "FlumeJava: easy, efficient data-parallel pipelines",
        "type": "inproceedings",
        "url": "https://doi.org/10.1145/1806596.1806638",
        "year": "2010"
    },
    "changpinyo2022all": {
        "author": "Changpinyo, Soravit and Kukliansky, Doron and Szpektor, Idan and Chen, Xi and Ding, Nan and Soricut, Radu",
        "journal": "arXiv preprint arXiv:2205.01883",
        "title": "All you may need for vqa are image captions",
        "type": "article",
        "year": "2022"
    },
    "chao:iccv2015": {
        "author": "Yu-Wei Chao and Zhan Wang and Yugeng He and Jiaxuan Wang and Jia Deng",
        "booktitle": "Proceedings of the IEEE International Conference on Computer Vision",
        "title": "HICO: A Benchmark for Recognizing Human-Object Interactions in Images",
        "type": "INPROCEEDINGS",
        "year": "2015"
    },
    "che2016mode": {
        "author": "Che, Tong and Li, Yanran and Jacob, Athul and Bengio, Yoshua and Li, Wenjie",
        "booktitle": "International Conference on Learning Representations",
        "title": "Mode Regularized Generative Adversarial Networks",
        "type": "inproceedings",
        "year": "2016"
    },
    "chefer2023attend": {
        "author": "Chefer, Hila and Alaluf, Yuval and Vinker, Yael and Wolf, Lior and Cohen-Or, Daniel",
        "journal": "ACM Transactions on Graphics (TOG)",
        "number": "4",
        "pages": "1--10",
        "publisher": "ACM New York, NY, USA",
        "title": "Attend-and-excite: Attention-based semantic guidance for text-to-image diffusion models",
        "type": "article",
        "volume": "42",
        "year": "2023"
    },
    "chen2015microsoft": {
        "author": "Chen, Xinlei and Fang, Hao and Lin, Tsung-Yi and Vedantam, Ramakrishna and Gupta, Saurabh and Doll{\\'a}r, Piotr and Zitnick, C Lawrence",
        "journal": "arXiv preprint arXiv:1504.00325",
        "title": "Microsoft coco captions: Data collection and evaluation server",
        "type": "article",
        "year": "2015"
    },
    "chen2020uniter": {
        "author": "Chen, Yen-Chun and Li, Linjie and Yu, Licheng and El Kholy, Ahmed and Ahmed, Faisal and Gan, Zhe and Cheng, Yu and Liu, Jingjing",
        "booktitle": "European conference on computer vision",
        "organization": "Springer",
        "pages": "104--120",
        "title": "Uniter: Universal image-text representation learning",
        "type": "inproceedings",
        "year": "2020"
    },
    "chen2022pali": {
        "author": "Chen, Xi and Wang, Xiao and Changpinyo, Soravit and Piergiovanni, AJ and Padlewski, Piotr and Salz, Daniel and Goodman, Sebastian and Grycner, Adam and Mustafa, Basil and Beyer, Lucas and others",
        "journal": "arXiv preprint arXiv:2209.06794",
        "title": "Pali: A jointly-scaled multilingual language-image model",
        "type": "article",
        "year": "2022"
    },
    "dash2017tac": {
        "author": "Dash, Ayushman and Gamboa, John Cristian Borges and Ahmed, Sheraz and Liwicki, Marcus and Afzal, Muhammad Zeshan",
        "journal": "arXiv preprint arXiv:1703.06412",
        "title": "Tac-gan-text conditioned auxiliary classifier generative adversarial network",
        "type": "article",
        "year": "2017"
    },
    "dehouche2023s": {
        "author": "Dehouche, Nassim and Dehouche, Kullathida",
        "journal": "Heliyon",
        "number": "6",
        "publisher": "Elsevier",
        "title": "What\u2019s in a text-to-image prompt? The potential of stable diffusion in visual arts education",
        "type": "article",
        "volume": "9",
        "year": "2023"
    },
    "devlin2018bert": {
        "author": "Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina",
        "journal": "arXiv preprint arXiv:1810.04805",
        "title": "Bert: Pre-training of deep bidirectional transformers for language understanding",
        "type": "article",
        "year": "2018"
    },
    "dinh2022tise": {
        "author": "Dinh, Tan M and Nguyen, Rang and Hua, Binh-Son",
        "booktitle": "European Conference on Computer Vision",
        "organization": "Springer",
        "pages": "594--609",
        "title": "TISE: Bag of metrics for text-to-image synthesis evaluation",
        "type": "inproceedings",
        "year": "2022"
    },
    "du2007ergas": {
        "author": "Du, Qian and Younan, Nicholas H and King, Roger and Shah, Vijay P",
        "journal": "IEEE Geoscience and Remote Sensing Letters",
        "number": "4",
        "pages": "518--522",
        "publisher": "IEEE",
        "title": "On the performance evaluation of pan-sharpening techniques",
        "type": "article",
        "volume": "4",
        "year": "2007"
    },
    "dunlap2023describing": {
        "author": "Dunlap, Lisa and Zhang, Yuhui and Wang, Xiaohan and Zhong, Ruiqi and Darrell, Trevor and Steinhardt, Jacob and Gonzalez, Joseph E and Yeung-Levy, Serena",
        "journal": "arXiv preprint arXiv:2312.02974",
        "title": "Describing Differences in Image Sets with Natural Language",
        "type": "article",
        "year": "2023"
    },
    "faghri2018vse++": {
        "author": "Faghri, Fartash and Fleet, David J and Kiros, Jamie Ryan and Fidler, Sanja",
        "booktitle": "Proceedings of the British Machine Vision Conference ({BMVC})",
        "title": "VSE\\+\\+: Improving Visual-Semantic Embeddings with Hard Negatives",
        "type": "article",
        "url": "https://github.com/fartashf/vsepp",
        "year": "2018"
    },
    "feng2023trainingfree": {
        "author": "Weixi Feng and Xuehai He and Tsu-Jui Fu and Varun Jampani and Arjun Reddy Akula and Pradyumna Narayana and Sugato Basu and Xin Eric Wang and William Yang Wang",
        "booktitle": "The Eleventh International Conference on Learning Representations",
        "title": "Training-Free Structured Diffusion Guidance for Compositional Text-to-Image Synthesis",
        "type": "inproceedings",
        "url": "https://openreview.net/forum?id=PUIqjT4rzq7",
        "year": "2023"
    },
    "fu2023dreamsim": {
        "author": "Fu, Stephanie and Tamir, Netanel and Sundaram, Shobhita and Chai, Lucy and Zhang, Richard and Dekel, Tali and Isola, Phillip",
        "journal": "arXiv preprint arXiv:2306.09344",
        "title": "DreamSim: Learning New Dimensions of Human Visual Similarity using Synthetic Data",
        "type": "article",
        "year": "2023"
    },
    "gal2022clipdirsim": {
        "author": "Gal, Rinon and Patashnik, Or and Maron, Haggai and Bermano, Amit H and Chechik, Gal and Cohen-Or, Daniel",
        "journal": "ACM Transactions on Graphics (TOG)",
        "number": "4",
        "pages": "1--13",
        "publisher": "ACM New York, NY, USA",
        "title": "StyleGAN-NADA: CLIP-guided domain adaptation of image generators",
        "type": "article",
        "volume": "41",
        "year": "2022"
    },
    "gan2020large": {
        "author": "Gan, Zhe and Chen, Yen-Chun and Li, Linjie and Zhu, Chen and Cheng, Yu and Liu, Jingjing",
        "journal": "Advances in Neural Information Processing Systems",
        "pages": "6616--6628",
        "title": "Large-scale adversarial training for vision-and-language representation learning",
        "type": "article",
        "volume": "33",
        "year": "2020"
    },
    "gordon2023mismatch": {
        "archiveprefix": "arXiv",
        "author": "Brian Gordon and Yonatan Bitton and Yonatan Shafir and Roopal Garg and Xi Chen and Dani Lischinski and Daniel Cohen-Or and Idan Szpektor",
        "eprint": "2312.03766",
        "primaryclass": "cs.CL",
        "title": "Mismatch Quest: Visual and Textual Feedback for Image-Text Misalignment",
        "type": "misc",
        "year": "2023"
    },
    "gretton2006kernel": {
        "author": "Gretton, Arthur and Borgwardt, Karsten and Rasch, Malte and Sch{\\\"o}lkopf, Bernhard and Smola, Alex",
        "journal": "Advances in neural information processing systems",
        "title": "A kernel method for the two-sample-problem",
        "type": "article",
        "volume": "19",
        "year": "2006"
    },
    "gu2020giqa": {
        "author": "Gu, Shuyang and Bao, Jianmin and Chen, Dong and Wen, Fang",
        "booktitle": "Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XI 16",
        "organization": "Springer",
        "pages": "369--385",
        "title": "Giqa: Generated image quality assessment",
        "type": "inproceedings",
        "year": "2020"
    },
    "gu2023automated": {
        "author": "Gu, Siqi",
        "journal": "arXiv preprint arXiv:2312.12933",
        "title": "Automated Testing for Text-to-Image Software",
        "type": "article",
        "year": "2023"
    },
    "hartwig2022learning": {
        "abstract": "Abstract View quality measures compute scores for given views and are used to determine an optimal view in viewpoint selection tasks. Unfortunately, despite the wide adoption of these measures, they are rather based on computational quantities, such as entropy, than human preferences. To instead tailor viewpoint measures towards humans, view quality measures need to be able to capture human viewpoint preferences. Therefore, we introduce a large-scale crowdsourced data set, which contains 58k annotated viewpoints for 3220 ModelNet40 models. Based on this data, we derive a neural view quality measure abiding to human preferences. We further demonstrate that this view quality measure not only generalizes to models unseen during training, but also to unseen model categories. We are thus able to predict view qualities for single images, and directly predict human preferred viewpoints for 3D models by exploiting point-based learning technology, without requiring to generate intermediate images or sampling the view sphere. We will detail our data collection procedure, describe the data analysis and model training and will evaluate the predictive quality of our trained viewpoint measure on unseen models and categories. To our knowledge, this is the first deep learning approach to predict a view quality measure solely based on human preferences.",
        "author": "Hartwig, S. and Schelling, M. and Onzenoodt, C. v. and V\u00e1zquez, P.-P. and Hermosilla, P. and Ropinski, T.",
        "doi": "https://doi.org/10.1111/cgf.14613",
        "eprint": "https://onlinelibrary.wiley.com/doi/pdf/10.1111/cgf.14613",
        "journal": "Computer Graphics Forum",
        "keywords": "user studies, interaction, perceptually-based rendering, rendering",
        "number": "6",
        "pages": "453-466",
        "title": "Learning Human Viewpoint Preferences from Sparsely Annotated Models",
        "type": "article",
        "url": "https://onlinelibrary.wiley.com/doi/abs/10.1111/cgf.14613",
        "volume": "41",
        "year": "2022"
    },
    "he2016deep": {
        "author": "He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian",
        "booktitle": "Proceedings of the IEEE conference on computer vision and pattern recognition",
        "pages": "770--778",
        "title": "Deep residual learning for image recognition",
        "type": "inproceedings",
        "year": "2016"
    },
    "hessel2021clipscore": {
        "author": "Hessel, Jack and Holtzman, Ari and Forbes, Maxwell and Bras, Ronan Le and Choi, Yejin",
        "journal": "arXiv preprint arXiv:2104.08718",
        "title": "Clipscore: A reference-free evaluation metric for image captioning",
        "type": "article",
        "year": "2021"
    },
    "hinz2020semantic": {
        "author": "Hinz, Tobias and Heinrich, Stefan and Wermter, Stefan",
        "journal": "IEEE transactions on pattern analysis and machine intelligence",
        "number": "3",
        "pages": "1552--1565",
        "publisher": "IEEE",
        "title": "Semantic object accuracy for generative text-to-image synthesis",
        "type": "article",
        "volume": "44",
        "year": "2020"
    },
    "ho2022imagen": {
        "archiveprefix": "arXiv",
        "author": "Jonathan Ho and William Chan and Chitwan Saharia and Jay Whang and Ruiqi Gao and Alexey Gritsenko and Diederik P. Kingma and Ben Poole and Mohammad Norouzi and David J. Fleet and Tim Salimans",
        "eprint": "2210.02303",
        "primaryclass": "cs.CV",
        "title": "Imagen Video: High Definition Video Generation with Diffusion Models",
        "type": "misc",
        "year": "2022"
    },
    "ho2022video": {
        "archiveprefix": "arXiv",
        "author": "Jonathan Ho and Tim Salimans and Alexey Gritsenko and William Chan and Mohammad Norouzi and David J. Fleet",
        "eprint": "2204.03458",
        "primaryclass": "cs.CV",
        "title": "Video Diffusion Models",
        "type": "misc",
        "year": "2022"
    },
    "hochreiter1997lstm": {
        "author": "Hochreiter, Sepp and Schmidhuber, J{\\\"u}rgen",
        "journal": "Neural computation",
        "number": "8",
        "pages": "1735--1780",
        "publisher": "MIT press",
        "title": "Long short-term memory",
        "type": "article",
        "volume": "9",
        "year": "1997"
    },
    "hodosh2013framing": {
        "author": "Hodosh, Micah and Young, Peter and Hockenmaier, Julia",
        "journal": "Journal of Artificial Intelligence Research",
        "pages": "853--899",
        "title": "Framing image description as a ranking task: Data, models and evaluation metrics",
        "type": "article",
        "volume": "47",
        "year": "2013"
    },
    "honnibal2017spacy": {
        "author": "Honnibal, Matthew and Montani, Ines",
        "journal": "To appear",
        "number": "1",
        "pages": "411--420",
        "title": "spaCy 2: Natural language understanding with Bloom embeddings, convolutional neural networks and incremental parsing",
        "type": "article",
        "volume": "7",
        "year": "2017"
    },
    "hsieh2023sugarcrepe": {
        "author": "Cheng-Yu Hsieh and Jieyu Zhang and Zixian Ma and Aniruddha Kembhavi and Ranjay Krishna",
        "booktitle": "Thirty-seventh Conference on Neural Information Processing Systems Datasets and Benchmarks Track",
        "title": "SugarCrepe: Fixing Hackable Benchmarks for Vision-Language Compositionality",
        "type": "inproceedings",
        "url": "https://openreview.net/forum?id",
        "year": "2023"
    },
    "huang2023t2icompbench": {
        "author": "Kaiyi Huang and Kaiyue Sun and Enze Xie and Zhenguo Li and Xihui Liu",
        "journal": "arXiv preprint arXiv:2307.06350",
        "title": "T2I-CompBench: A Comprehensive Benchmark for Open-world Compositional Text-to-image Generation",
        "type": "article",
        "year": "2023"
    },
    "ijcai2022p759": {
        "author": "Cao, Min and Li, Shiping and Li, Juntao and Nie, Liqiang and Zhang, Min",
        "booktitle": "Proceedings of the Thirty-First International Joint Conference on Artificial Intelligence, {IJCAI-22}",
        "doi": "10.24963/ijcai.2022/759",
        "editor": "Lud De Raedt",
        "month": "7",
        "note": "Survey Track",
        "pages": "5410--5417",
        "publisher": "International Joint Conferences on Artificial Intelligence Organization",
        "title": "Image-text Retrieval: A Survey on Recent Research and Development",
        "type": "inproceedings",
        "url": "https://doi.org/10.24963/ijcai.2022/759",
        "year": "2022"
    },
    "jenkins2019unsupervised": {
        "author": "Jenkins, Porter and Farag, Ahmad and Wang, Suhang and Li, Zhenhui",
        "booktitle": "Proceedings of the 28th ACM international conference on information and knowledge management",
        "pages": "1993--2002",
        "title": "Unsupervised representation learning of spatial data via multimodal embedding",
        "type": "inproceedings",
        "year": "2019"
    },
    "jiang2019tiger": {
        "author": "Jiang, Ming and Huang, Qiuyuan and Zhang, Lei and Wang, Xin and Zhang, Pengchuan and Gan, Zhe and Diesner, Jana and Gao, Jianfeng",
        "journal": "arXiv preprint arXiv:1909.02050",
        "title": "Tiger: Text-to-image grounding for image caption evaluation",
        "type": "article",
        "year": "2019"
    },
    "kamath-etal-2023-text": {
        "address": "Singapore",
        "author": "Kamath, Amita and Hessel, Jack and Chang, Kai-Wei",
        "booktitle": "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
        "doi": "10.18653/v1/2023.emnlp-main.301",
        "editor": "Bouamor, Houda and Pino, Juan and Bali, Kalika",
        "month": "dec",
        "pages": "4933--4944",
        "publisher": "Association for Computational Linguistics",
        "title": "Text encoders bottleneck compositionality in contrastive vision-language models",
        "type": "inproceedings",
        "url": "https://aclanthology.org/2023.emnlp-main.301",
        "year": "2023"
    },
    "kamath2023s": {
        "author": "Kamath, Amita and Hessel, Jack and Chang, Kai-Wei",
        "journal": "arXiv preprint arXiv:2310.19785",
        "title": "What's \"up\" with vision-language models? Investigating their struggle with spatial reasoning",
        "type": "article",
        "year": "2023"
    },
    "kane-etal-2020-nubia": {
        "abstract": "We present NUBIA, a methodology to build automatic evaluation metrics for text generation using only machine learning models as core components. A typical NUBIA model is composed of three modules: a neural feature extractor, an aggregator and a calibrator. We demonstrate an implementation of NUBIA showing competitive performance with stateof-the art metrics used to evaluate machine translation and state-of-the art results for image captions quality evaluation. In addition to strong performance, NUBIA models have the advantage of being modular and improve in synergy with advances in text generation models.",
        "address": "Online (Dublin, Ireland)",
        "author": "Kane, Hassan and Kocyigit, Muhammed Yusuf and Abdalla, Ali and Ajanoh, Pelkins and Coulibali, Mohamed",
        "booktitle": "Proceedings of the 1st Workshop on Evaluating NLG Evaluation",
        "editor": "Agarwal, Shubham and Du{\\v{s}}ek, Ond{\\v{r}}ej and Gehrmann, Sebastian and Gkatzia, Dimitra and Konstas, Ioannis and Van Miltenburg, Emiel and Santhanam, Sashank",
        "month": "dec",
        "pages": "28--37",
        "publisher": "Association for Computational Linguistics",
        "title": "{NUBIA}: {N}e{U}ral Based Interchangeability Assessor for Text Generation",
        "type": "inproceedings",
        "url": "https://aclanthology.org/2020.evalnlgeval-1.4",
        "year": "2020"
    },
    "kang2023scaling": {
        "author": "Kang, Minguk and Zhu, Jun-Yan and Zhang, Richard and Park, Jaesik and Shechtman, Eli and Paris, Sylvain and Park, Taesung",
        "booktitle": "Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition",
        "pages": "10124--10134",
        "title": "Scaling up gans for text-to-image synthesis",
        "type": "inproceedings",
        "year": "2023"
    },
    "karras2019style": {
        "author": "Karras, Tero and Laine, Samuli and Aila, Timo",
        "booktitle": "Proceedings of the IEEE/CVF conference on computer vision and pattern recognition",
        "pages": "4401--4410",
        "title": "A style-based generator architecture for generative adversarial networks",
        "type": "inproceedings",
        "year": "2019"
    },
    "khashabi2020unifiedqa": {
        "author": "Khashabi, Daniel and Min, Sewon and Khot, Tushar and Sabharwal, Ashish and Tafjord, Oyvind and Clark, Peter and Hajishirzi, Hannaneh",
        "journal": "arXiv preprint arXiv:2005.00700",
        "title": "Unifiedqa: Crossing format boundaries with a single qa system",
        "type": "article",
        "year": "2020"
    },
    "kim2021vilt": {
        "author": "Kim, Wonjae and Son, Bokyung and Kim, Ildoo",
        "booktitle": "International Conference on Machine Learning",
        "organization": "PMLR",
        "pages": "5583--5594",
        "title": "Vilt: Vision-and-language transformer without convolution or region supervision",
        "type": "inproceedings",
        "year": "2021"
    },
    "kim2022mutual": {
        "author": "Kim, Jin-Hwa and Kim, Yunji and Lee, Jiyoung and Yoo, Kang Min and Lee, Sang-Woo",
        "journal": "Advances in Neural Information Processing Systems",
        "pages": "35072--35086",
        "title": "Mutual information divergence: A unified metric for multimodal generative models",
        "type": "article",
        "volume": "35",
        "year": "2022"
    },
    "kim2023imagine": {
        "author": "Yeongbin Kim and Gautam Singh and Junyeong Park and Caglar Gulcehre and Sungjin Ahn",
        "booktitle": "Thirty-seventh Conference on Neural Information Processing Systems Datasets and Benchmarks Track",
        "title": "Imagine the Unseen World: A Benchmark for Systematic Generalization in Visual World Models",
        "type": "inproceedings",
        "url": "https://openreview.net/forum?id",
        "year": "2023"
    },
    "kiros2014unifying": {
        "author": "Kiros, Ryan and Salakhutdinov, Ruslan and Zemel, Richard S",
        "journal": "arXiv preprint arXiv:1411.2539",
        "title": "Unifying visual-semantic embeddings with multimodal neural language models",
        "type": "article",
        "year": "2014"
    },
    "kiros2018illustrative": {
        "author": "Kiros, Jamie and Chan, William and Hinton, Geoffrey",
        "booktitle": "Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
        "pages": "922--933",
        "title": "Illustrative language understanding: Large-scale visual grounding with image search",
        "type": "inproceedings",
        "year": "2018"
    },
    "ku2023viescore": {
        "author": "Ku, Max and Jiang, Dongfu and Wei, Cong and Yue, Xiang and Chen, Wenhu",
        "journal": "arXiv preprint arXiv:2312.14867",
        "title": "VIEScore: Towards Explainable Metrics for Conditional Image Synthesis Evaluation",
        "type": "article",
        "year": "2023"
    },
    "kumar2023comprehensive": {
        "author": "Kumar, Satyam and Musharaf, Dayima and Musharaf, Seerat and Sagar, Anil Kumar",
        "booktitle": "International Conference on Advanced Communication and Intelligent Systems",
        "organization": "Springer",
        "pages": "90--103",
        "title": "A Comprehensive Review of the Latest Advancements in Large Generative AI Models",
        "type": "inproceedings",
        "year": "2023"
    },
    "kuznetsova2020open": {
        "author": "Kuznetsova, Alina and Rom, Hassan and Alldrin, Neil and Uijlings, Jasper and Krasin, Ivan and Pont-Tuset, Jordi and Kamali, Shahab and Popov, Stefan and Malloci, Matteo and Kolesnikov, Alexander and others",
        "journal": "International Journal of Computer Vision",
        "number": "7",
        "pages": "1956--1981",
        "publisher": "Springer",
        "title": "The open images dataset v4: Unified image classification, object detection, and visual relationship detection at scale",
        "type": "article",
        "volume": "128",
        "year": "2020"
    },
    "kynkaanniemi2019improved": {
        "author": "Kynk{\\\"a}{\\\"a}nniemi, Tuomas and Karras, Tero and Laine, Samuli and Lehtinen, Jaakko and Aila, Timo",
        "journal": "Advances in Neural Information Processing Systems",
        "title": "Improved precision and recall metric for assessing generative models",
        "type": "article",
        "volume": "32",
        "year": "2019"
    },
    "lavie2004significance": {
        "author": "Lavie, Alon and Sagae, Kenji and Jayaraman, Shyamsundar",
        "booktitle": "Machine Translation: From Real Users to Research: 6th Conference of the Association for Machine Translation in the Americas, AMTA 2004, Washington, DC, USA, September 28-October 2, 2004. Proceedings 6",
        "organization": "Springer",
        "pages": "134--143",
        "title": "The significance of recall in automatic metrics for MT evaluation",
        "type": "inproceedings",
        "year": "2004"
    },
    "lee-etal-2020-vilbertscore": {
        "address": "Online",
        "author": "Lee, Hwanhee and Yoon, Seunghyun and Dernoncourt, Franck and Kim, Doo Soon and Bui, Trung and Jung, Kyomin",
        "booktitle": "Proceedings of the First Workshop on Evaluation and Comparison of NLP Systems",
        "doi": "10.18653/v1/2020.eval4nlp-1.4",
        "editor": "Eger, Steffen and Gao, Yang and Peyrard, Maxime and Zhao, Wei and Hovy, Eduard",
        "month": "nov",
        "pages": "34--39",
        "publisher": "Association for Computational Linguistics",
        "title": "{V}i{LBERTS}core: Evaluating Image Caption Using Vision-and-Language {BERT}",
        "type": "inproceedings",
        "url": "https://aclanthology.org/2020.eval4nlp-1.4",
        "year": "2020"
    },
    "lee2018stacked": {
        "author": "Lee, Kuang-Huei and Chen, Xi and Hua, Gang and Hu, Houdong and He, Xiaodong",
        "booktitle": "Proceedings of the European conference on computer vision (ECCV)",
        "pages": "201--216",
        "title": "Stacked cross attention for image-text matching",
        "type": "inproceedings",
        "year": "2018"
    },
    "lee2023aligning": {
        "archiveprefix": "arXiv",
        "author": "Kimin Lee and Hao Liu and Moonkyung Ryu and Olivia Watkins and Yuqing Du and Craig Boutilier and Pieter Abbeel and Mohammad Ghavamzadeh and Shixiang Shane Gu",
        "eprint": "2302.12192",
        "primaryclass": "cs.LG",
        "title": "Aligning Text-to-Image Models using Human Feedback",
        "type": "misc",
        "year": "2023"
    },
    "lee2023holistic": {
        "author": "Tony Lee and Michihiro Yasunaga and Chenlin Meng and Yifan Mai and Joon Sung Park and Agrim Gupta and Yunzhi Zhang and Deepak Narayanan and Hannah Benita Teufel and Marco Bellagente and Minguk Kang and Taesung Park and Jure Leskovec and Jun-Yan Zhu and Li Fei-Fei and Jiajun Wu and Stefano Ermon and Percy Liang",
        "booktitle": "Thirty-seventh Conference on Neural Information Processing Systems Datasets and Benchmarks Track",
        "title": "Holistic Evaluation of Text-to-Image Models",
        "type": "inproceedings",
        "url": "https://openreview.net/forum?id",
        "year": "2023"
    },
    "lee2023text": {
        "author": "Lee, Jaewoong and Jang, Sangwon and Jo, Jaehyeong and Yoon, Jaehong and Kim, Yunji and Kim, Jin-Hwa and Ha, Jung-Woo and Hwang, Sung Ju",
        "journal": "arXiv preprint arXiv:2304.01515",
        "title": "Text-Conditioned Sampling Framework for Text-to-Image Generation with Masked Generative Models",
        "type": "article",
        "year": "2023"
    },
    "lewis2022does": {
        "author": "Lewis, Martha and Nayak, Nihal V and Yu, Peilin and Yu, Qinan and Merullo, Jack and Bach, Stephen H and Pavlick, Ellie",
        "journal": "arXiv preprint arXiv:2212.10537",
        "title": "Does clip bind concepts? probing compositionality in large image models",
        "type": "article",
        "year": "2022"
    },
    "li-etal-2022-mplug": {
        "abstract": "Large-scale pre-trained foundation models have been an emerging paradigm for building artificial intelligence (AI) systems, which can be quickly adapted to a wide range of downstream tasks. This paper presents mPLUG, a new vision-language foundation model for both cross-modal understanding and generation. Most existing pre-trained models suffer from inefficiency and linguistic signal overwhelmed by long visual sequences in cross-modal alignment. To address both problems, mPLUG introduces an effective and efficient vision-language architecture with novel cross-modal skip-connections.mPLUG is pre-trained end-to-end on large-scale image-text pairs with both discriminative and generative objectives. It achieves state-of-the-art results on a wide range of vision-language downstream tasks, including image captioning, image-text retrieval, visual grounding and visual question answering. mPLUG also demonstrates strong zero-shot transferability on vision-language and video-language tasks. The code and pre-trained models are available at \\url{https://github.com/alibaba/AliceMind}",
        "address": "Abu Dhabi, United Arab Emirates",
        "author": "Li, Chenliang and Xu, Haiyang and Tian, Junfeng and Wang, Wei and Yan, Ming and Bi, Bin and Ye, Jiabo and Chen, He and Xu, Guohai and Cao, Zheng and Zhang, Ji and Huang, Songfang and Huang, Fei and Zhou, Jingren and Si, Luo",
        "booktitle": "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
        "doi": "10.18653/v1/2022.emnlp-main.488",
        "editor": "Goldberg, Yoav and Kozareva, Zornitsa and Zhang, Yue",
        "month": "dec",
        "pages": "7241--7259",
        "publisher": "Association for Computational Linguistics",
        "title": "m{PLUG}: Effective and Efficient Vision-Language Learning by Cross-modal Skip-connections",
        "type": "inproceedings",
        "url": "https://aclanthology.org/2022.emnlp-main.488",
        "year": "2022"
    },
    "li2020oscar": {
        "author": "Li, Xiujun and Yin, Xi and Li, Chunyuan and Zhang, Pengchuan and Hu, Xiaowei and Zhang, Lei and Wang, Lijuan and Hu, Houdong and Dong, Li and Wei, Furu and others",
        "booktitle": "Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XXX 16",
        "organization": "Springer",
        "pages": "121--137",
        "title": "Oscar: Object-semantics aligned pre-training for vision-language tasks",
        "type": "inproceedings",
        "year": "2020"
    },
    "li2022blip": {
        "author": "Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven",
        "booktitle": "International Conference on Machine Learning",
        "organization": "PMLR",
        "pages": "12888--12900",
        "title": "Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation",
        "type": "inproceedings",
        "year": "2022"
    },
    "li2023covlm": {
        "author": "Li, Junyan and Chen, Delin and Hong, Yining and Chen, Zhenfang and Chen, Peihao and Shen, Yikang and Gan, Chuang",
        "journal": "arXiv preprint arXiv:2311.03354",
        "title": "CoVLM: Composing Visual Entities and Relationships in Large Language Models Via Communicative Decoding",
        "type": "article",
        "year": "2023"
    },
    "li2023divide": {
        "author": "Li, Yumeng and Keuper, Margret and Zhang, Dan and Khoreva, Anna",
        "journal": "arXiv preprint arXiv:2307.10864",
        "title": "Divide \\& bind your attention for improved generative semantic nursing",
        "type": "article",
        "year": "2023"
    },
    "li2024compositional": {
        "author": "Junyan Li and Delin Chen and Yining Hong and Zhenfang Chen and Peihao Chen and Yikang Shen and Chuang Gan",
        "booktitle": "The Twelfth International Conference on Learning Representations",
        "title": "Compositional {VLM}: Composing Visual Entities and Relationships in Large Language Models Via Communicative Decoding",
        "type": "inproceedings",
        "url": "https://openreview.net/forum?id",
        "year": "2024"
    },
    "liang2020cpgan": {
        "author": "Liang, Jiadong and Pei, Wenjie and Lu, Feng",
        "booktitle": "Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part IV 16",
        "organization": "Springer",
        "pages": "491--508",
        "title": "Cpgan: Content-parsing generative adversarial networks for text-to-image synthesis",
        "type": "inproceedings",
        "year": "2020"
    },
    "liang2023rich": {
        "author": "Liang, Youwei and He, Junfeng and Li, Gang and Li, Peizhao and Klimovskiy, Arseniy and Carolan, Nicholas and Sun, Jiao and Pont-Tuset, Jordi and Young, Sarah and Yang, Feng and others",
        "journal": "arXiv preprint arXiv:2312.10240",
        "title": "Rich Human Feedback for Text-to-Image Generation",
        "type": "article",
        "year": "2023"
    },
    "lin-2004-rouge": {
        "address": "Barcelona, Spain",
        "author": "Lin, Chin-Yew",
        "booktitle": "Text Summarization Branches Out",
        "month": "jul",
        "pages": "74--81",
        "publisher": "Association for Computational Linguistics",
        "title": "{ROUGE}: A Package for Automatic Evaluation of Summaries",
        "type": "inproceedings",
        "url": "https://aclanthology.org/W04-1013",
        "year": "2004"
    },
    "lin2014microsoft": {
        "author": "Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\\'a}r, Piotr and Zitnick, C Lawrence",
        "booktitle": "Computer Vision--ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13",
        "organization": "Springer",
        "pages": "740--755",
        "title": "Microsoft coco: Common objects in context",
        "type": "inproceedings",
        "year": "2014"
    },
    "lin2023revisiting": {
        "author": "Lin, Zhiqiu and Chen, Xinyue and Pathak, Deepak and Zhang, Pengchuan and Ramanan, Deva",
        "journal": "arXiv preprint arXiv:2306.01879",
        "title": "Revisiting the Role of Language Priors in Vision-Language Models",
        "type": "article",
        "year": "2023"
    },
    "liu2021swin": {
        "author": "Liu, Ze and Lin, Yutong and Cao, Yue and Hu, Han and Wei, Yixuan and Zhang, Zheng and Lin, Stephen and Guo, Baining",
        "booktitle": "Proceedings of the IEEE/CVF international conference on computer vision",
        "pages": "10012--10022",
        "title": "Swin transformer: Hierarchical vision transformer using shifted windows",
        "type": "inproceedings",
        "year": "2021"
    },
    "liu2022compositional": {
        "author": "Liu, Nan and Li, Shuang and Du, Yilun and Torralba, Antonio and Tenenbaum, Joshua B",
        "booktitle": "European Conference on Computer Vision",
        "organization": "Springer",
        "pages": "423--439",
        "title": "Compositional visual generation with composable diffusion models",
        "type": "inproceedings",
        "year": "2022"
    },
    "liu2024fetv": {
        "author": "Liu, Yuanxin and Li, Lei and Ren, Shuhuai and Gao, Rundong and Li, Shicheng and Chen, Sishuo and Sun, Xu and Hou, Lu",
        "journal": "Advances in Neural Information Processing Systems",
        "title": "Fetv: A benchmark for fine-grained evaluation of open-domain text-to-video generation",
        "type": "article",
        "volume": "36",
        "year": "2024"
    },
    "lopez2016revisiting": {
        "author": "Lopez-Paz, David and Oquab, Maxime",
        "booktitle": "International Conference on Learning Representations",
        "title": "Revisiting Classifier Two-Sample Tests",
        "type": "inproceedings",
        "year": "2016"
    },
    "lu2023llmscore": {
        "author": "Yujie Lu and Xianjun Yang and Xiujun Li and Xin Eric Wang and William Yang Wang",
        "booktitle": "Thirty-seventh Conference on Neural Information Processing Systems",
        "title": "LLMScore: Unveiling the Power of Large Language Models in Text-to-Image Synthesis Evaluation",
        "type": "inproceedings",
        "url": "https://openreview.net/forum?id",
        "year": "2023"
    },
    "ma2023examination": {
        "author": "Ma, Teli and Li, Rong and Liang, Junwei",
        "journal": "arXiv preprint arXiv:2308.10509",
        "title": "An examination of the compositionality of large generative vision-language models",
        "type": "article",
        "year": "2023"
    },
    "ma2024cobra": {
        "author": "Ma, Zheng and Wang, Changxin and Ouyang, Yawen and Zhao, Fei and Zhang, Jianbing and Huang, Shujian and Chen, Jiajun",
        "journal": "arXiv preprint arXiv:2402.11572",
        "title": "Cobra Effect in Reference-Free Image Captioning Metrics",
        "type": "article",
        "year": "2024"
    },
    "madhyastha-etal-2019-vifidel": {
        "address": "Florence, Italy",
        "author": "Madhyastha, Pranava and Wang, Josiah and Specia, Lucia",
        "booktitle": "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
        "doi": "10.18653/v1/P19-1654",
        "editor": "Korhonen, Anna and Traum, David and M{\\`a}rquez, Llu{\\'\\i}s",
        "month": "jul",
        "pages": "6539--6550",
        "publisher": "Association for Computational Linguistics",
        "title": "{VIFIDEL}: Evaluating the Visual Fidelity of Image Descriptions",
        "type": "inproceedings",
        "url": "https://aclanthology.org/P19-1654",
        "year": "2019"
    },
    "mao2016training": {
        "abstract": "In this paper, we focus on training and evaluating effective word embeddings with both text and visual information. More specifically, we introduce a large-scale dataset with 300 million sentences describing over 40 million images crawled and downloaded from publicly available Pins (i.e. an image with sentence descriptions uploaded by users) on Pinterest [2]. This dataset is more than 200 times larger than MS COCO [22], the standard large-scale image dataset with sentence descriptions. In addition, we construct an evaluation dataset to directly assess the effectiveness of word embeddings in terms of finding semantically similar or related words and phrases. The word/phrase pairs in this evaluation dataset are collected from the click data with millions of users in an image search system, thus contain rich semantic relationships. Based on these datasets, we propose and compare several Recurrent Neural Networks (RNNs) based multimodal (text and image) models. Experiments show that our model benefits from incorporating the visual information into the word embeddings, and a weight sharing strategy is crucial for learning such multimodal embeddings. The project page is: http://www.stat.ucla.edu/~junhua.mao/multimodal_embedding.html.",
        "address": "Red Hook, NY, USA",
        "author": "Mao, Junhua and Xu, Jiajing and Jing, Yushi and Yuille, Alan",
        "booktitle": "Proceedings of the 30th International Conference on Neural Information Processing Systems",
        "isbn": "9781510838819",
        "location": "Barcelona, Spain",
        "numpages": "9",
        "pages": "442\u2013450",
        "publisher": "Curran Associates Inc.",
        "series": "NIPS'16",
        "title": "Training and evaluating multimodal word embeddings with large-scale web annotated images",
        "type": "inproceedings",
        "year": "2016"
    },
    "marcus2022preliminary": {
        "archiveprefix": "arXiv",
        "author": "Gary Marcus and Ernest Davis and Scott Aaronson",
        "eprint": "2204.13807",
        "primaryclass": "cs.CV",
        "title": "A very preliminary analysis of DALL-E 2",
        "type": "misc",
        "year": "2022"
    },
    "menendez1997jensen": {
        "abstract": "In this paper we investigate the Jensen-Shannon parametric divergence for testing goodness-of-fit for point estimation. Most of the work presented is an analytical study of the asymptotic differences between different members of the family proposed in goodness of fit, together with an examination of closer approximations to the exact distribution of these statistics than the commonly used chi-squared distribution. Finally the minimum Jensen-Shannon divergence estimates are introduced and compared with other well-known estimators by computer simulation.",
        "author": "M.L. Men\u00e9ndez and J.A. Pardo and L. Pardo and M.C. Pardo",
        "doi": "https://doi.org/10.1016/S0016-0032(96)00063-4",
        "issn": "0016-0032",
        "journal": "Journal of the Franklin Institute",
        "number": "2",
        "pages": "307-318",
        "title": "The Jensen-Shannon divergence",
        "type": "article",
        "url": "https://www.sciencedirect.com/science/article/pii/S0016003296000634",
        "volume": "334",
        "year": "1997"
    },
    "miller1995Wordnet": {
        "abstract": "Because meaningful sentences are composed of meaningful words, any system that hopes to process natural languages as people do must have information about words and their meanings. This information is traditionally provided through dictionaries, and machine-readable dictionaries are now widely available. But dictionary entries evolved for the convenience of human readers, not for machines. WordNet1 provides a more effective combination of traditional lexicographic information and modern computing. WordNet is an online lexical database designed for use under program control. English nouns, verbs, adjectives, and adverbs are organized into sets of synonyms, each representing a lexicalized concept. Semantic relations link the synonym sets [4].",
        "address": "New York, NY, USA",
        "author": "Miller, George A.",
        "doi": "10.1145/219717.219748",
        "issn": "0001-0782",
        "issue_date": "Nov. 1995",
        "journal": "Commun. ACM",
        "month": "nov",
        "number": "11",
        "numpages": "3",
        "pages": "39\u201341",
        "publisher": "Association for Computing Machinery",
        "title": "WordNet: a lexical database for English",
        "type": "article",
        "url": "https://doi.org/10.1145/219717.219748",
        "volume": "38",
        "year": "1995"
    },
    "minderer2022simple": {
        "author": "Minderer, M and Gritsenko, A and Stone, A and Neumann, M and Weissenborn, D and Dosovitskiy, A and Mahendran, A and Arnab, A and Dehghani, M and Shen, Z and others",
        "journal": "arXiv preprint arXiv:2205.06230",
        "title": "Simple open-vocabulary object detection with vision transformers",
        "type": "article",
        "volume": "2",
        "year": "2022"
    },
    "moorthy2011blind": {
        "author": "Moorthy, Anush Krishna and Bovik, Alan Conrad",
        "journal": "IEEE transactions on Image Processing",
        "number": "12",
        "pages": "3350--3364",
        "publisher": "IEEE",
        "title": "Blind image quality assessment: From natural scene statistics to perceptual quality",
        "type": "article",
        "volume": "20",
        "year": "2011"
    },
    "openai2023gpt": {
        "author": "OpenAI",
        "journal": "arXiv preprint arXiv:2303.08774",
        "pages": "13",
        "title": "GPT-4 technical report",
        "type": "article",
        "volume": "2",
        "year": "2023"
    },
    "park2021benchmark": {
        "author": "Dong Huk Park and Samaneh Azadi and Xihui Liu and Trevor Darrell and Anna Rohrbach",
        "booktitle": "Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 1)",
        "title": "Benchmark for Compositional Text-to-Image Synthesis",
        "type": "inproceedings",
        "url": "https://openreview.net/forum?id",
        "year": "2021"
    },
    "pascal-voc-2008": {
        "author": "\"Everingham, M. and Van~Gool, L. and Williams, C. K. I. and Winn, J. and Zisserman, A.\",",
        "howpublished": "\"http://www.pascal-network.org/challenges/VOC/voc2008/workshop/index.html\"",
        "title": "\"The {PASCAL} {V}isual {O}bject {C}lasses {C}hallenge 2008 {(VOC2008)} {R}esults\",",
        "type": "misc"
    },
    "plummer2015flickr30kentities": {
        "author": "Plummer, Bryan A. and Wang, Liwei and Cervantes, Chris M. and Caicedo, Juan C. and Hockenmaier, Julia and Lazebnik, Svetlana",
        "booktitle": "2015 IEEE International Conference on Computer Vision (ICCV)",
        "doi": "10.1109/ICCV.2015.303",
        "keywords": "Standards;Benchmark testing;Image resolution;Grounding;Glass;Training;Image color analysis",
        "number": "",
        "pages": "2641-2649",
        "title": "Flickr30k Entities: Collecting Region-to-Phrase Correspondences for Richer Image-to-Sentence Models",
        "type": "INPROCEEDINGS",
        "volume": "",
        "year": "2015"
    },
    "pmlr-v139-cho21a": {
        "abstract": "\t {Existing methods for vision-and-language learning typically require designing task-specific architectures and objectives for each task. For example, a multi-label answer classifier for visual question answering, a region scorer for referring expression comprehension, and a language decoder for image captioning, etc. To alleviate these hassles, in this work, we propose a unified framework that learns different tasks in a single architecture with the same language modeling objective, i.e., multimodal conditional text generation, where our models learn to generate labels in text based on the visual and textual inputs. On 7 popular vision-and-language benchmarks, including visual question answering, referring expression comprehension, visual commonsense reasoning, most of which have been previously modeled as discriminative tasks, our generative approach (with a single unified architecture) reaches comparable performance to recent task-specific state-of-the-art vision-and-language models. Moreover, our generative approach shows better generalization ability on questions that have rare answers. Also, we show that our framework allows multi-task learning in a single architecture with a single set of parameters, achieving similar performance to separately optimized single-task models. Our code is publicly available at: https://github.com/j-min/VL-T5",
        "author": "Cho, Jaemin and Lei, Jie and Tan, Hao and Bansal, Mohit",
        "booktitle": "\t {Proceedings of the 38th International Conference on Machine Learning",
        "editor": "\t {Meila, Marina and Zhang, Tong",
        "month": "\t {18--24 Jul",
        "pages": "\t {1931--1942",
        "pdf": "\t {http://proceedings.mlr.press/v139/cho21a/cho21a.pdf",
        "publisher": "PMLR",
        "series": "\t {Proceedings of Machine Learning Research",
        "title": "\t {Unifying Vision-and-Language Tasks via Text Generation",
        "type": "InProceedings",
        "url": "\t {https://proceedings.mlr.press/v139/cho21a.html",
        "volume": "\t {139",
        "year": "\t {2021"
    },
    "pmlr-v139-kim21k": {
        "abstract": "\t {Vision-and-Language Pre-training (VLP) has improved performance on various joint vision-and-language downstream tasks. Current approaches to VLP heavily rely on image feature extraction processes, most of which involve region supervision (e.g., object detection) and the convolutional architecture (e.g., ResNet). Although disregarded in the literature, we find it problematic in terms of both (1) efficiency/speed, that simply extracting input features requires much more computation than the multimodal interaction steps; and (2) expressive power, as it is upper bounded to the expressive power of the visual embedder and its predefined visual vocabulary. In this paper, we present a minimal VLP model, Vision-and-Language Transformer (ViLT), monolithic in the sense that the processing of visual inputs is drastically simplified to just the same convolution-free manner that we process textual inputs. We show that ViLT is up to tens of times faster than previous VLP models, yet with competitive or better downstream task performance. Our code and pre-trained weights are available at https://github.com/dandelin/vilt.",
        "author": "Kim, Wonjae and Son, Bokyung and Kim, Ildoo",
        "booktitle": "\t {Proceedings of the 38th International Conference on Machine Learning",
        "editor": "\t {Meila, Marina and Zhang, Tong",
        "month": "\t {18--24 Jul",
        "pages": "\t {5583--5594",
        "pdf": "\t {http://proceedings.mlr.press/v139/kim21k/kim21k.pdf",
        "publisher": "PMLR",
        "series": "\t {Proceedings of Machine Learning Research",
        "title": "\t {ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision",
        "type": "InProceedings",
        "url": "\t {http://proceedings.mlr.press/v139/kim21k.html",
        "volume": "\t {139",
        "year": "\t {2021"
    },
    "pmlr-v37-kusnerb15": {
        "abstract": "\t {We present the Word Mover\u2019s Distance (WMD), a novel distance function between text documents. Our work is based on recent results in word embeddings that learn semantically meaningful representations for words from local co-occurrences in sentences. The WMD distance measures the dissimilarity between two text documents as the minimum amount of distance that the embedded words of one document need to \"travel\" to reach the embedded words of another document. We show that this distance metric can be cast as an instance of the Earth Mover\u2019s Distance, a well studied transportation problem for which several highly efficient solvers have been developed. Our metric has no hyperparameters and is straight-forward to implement. Further, we demonstrate on eight real world document classification data sets, in comparison with seven state-of-the-art baselines, that the WMD metric leads to unprecedented low k-nearest neighbor document classification error rates.",
        "address": "\t {Lille, France",
        "author": "\t {Kusner, Matt and Sun, Yu and Kolkin, Nicholas and Weinberger, Kilian",
        "booktitle": "\t {Proceedings of the 32nd International Conference on Machine Learning",
        "editor": "\t {Bach, Francis and Blei, David",
        "month": "\t {07--09 Jul",
        "pages": "\t {957--966",
        "pdf": "\t {http://proceedings.mlr.press/v37/kusnerb15.pdf",
        "publisher": "PMLR",
        "series": "\t {Proceedings of Machine Learning Research",
        "title": "\t {From Word Embeddings To Document Distances",
        "type": "InProceedings",
        "url": "\t {https://proceedings.mlr.press/v37/kusnerb15.html",
        "volume": "\t {37",
        "year": "\t {2015"
    },
    "po2023state": {
        "archiveprefix": "arXiv",
        "author": "Ryan Po and Wang Yifan and Vladislav Golyanik and Kfir Aberman and Jonathan T. Barron and Amit H. Bermano and Eric Ryan Chan and Tali Dekel and Aleksander Holynski and Angjoo Kanazawa and C. Karen Liu and Lingjie Liu and Ben Mildenhall and Matthias Nie\u00dfner and Bj\u00f6rn Ommer and Christian Theobalt and Peter Wonka and Gordon Wetzstein",
        "eprint": "2310.07204",
        "primaryclass": "cs.AI",
        "title": "State of the Art on Diffusion Models for Visual Computing",
        "type": "misc",
        "year": "2023"
    },
    "prabhudesai2023aligning": {
        "archiveprefix": "arXiv",
        "author": "Mihir Prabhudesai and Anirudh Goyal and Deepak Pathak and Katerina Fragkiadaki",
        "eprint": "2310.03739",
        "primaryclass": "cs.CV",
        "title": "Aligning Text-to-Image Diffusion Models with Reward Backpropagation",
        "type": "misc",
        "year": "2023"
    },
    "radford2019language": {
        "author": "Radford, Alec and Wu, Jeffrey and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya and others",
        "journal": "OpenAI blog",
        "number": "8",
        "pages": "9",
        "title": "Language models are unsupervised multitask learners",
        "type": "article",
        "volume": "1",
        "year": "2019"
    },
    "radford2021learning": {
        "author": "Radford, Alec and Kim, Jong Wook and Hallacy, Chris and Ramesh, Aditya and Goh, Gabriel and Agarwal, Sandhini and Sastry, Girish and Askell, Amanda and Mishkin, Pamela and Clark, Jack and others",
        "booktitle": "International conference on machine learning",
        "organization": "PMLR",
        "pages": "8748--8763",
        "title": "Learning transferable visual models from natural language supervision",
        "type": "inproceedings",
        "year": "2021"
    },
    "ramesh2021zero": {
        "author": "Ramesh, Aditya and Pavlov, Mikhail and Goh, Gabriel and Gray, Scott and Voss, Chelsea and Radford, Alec and Chen, Mark and Sutskever, Ilya",
        "booktitle": "International conference on machine learning",
        "organization": "Pmlr",
        "pages": "8821--8831",
        "title": "Zero-shot text-to-image generation",
        "type": "inproceedings",
        "year": "2021"
    },
    "rashtchian2010collecting": {
        "author": "Rashtchian, Cyrus and Young, Peter and Hodosh, Micah and Hockenmaier, Julia",
        "booktitle": "Proceedings of the NAACL HLT 2010 workshop on creating speech and language data with Amazon\u2019s Mechanical Turk",
        "pages": "139--147",
        "title": "Collecting image annotations using amazon\u2019s mechanical turk",
        "type": "inproceedings",
        "year": "2010"
    },
    "ravuri2019classification": {
        "author": "Ravuri, Suman and Vinyals, Oriol",
        "journal": "Advances in neural information processing systems",
        "title": "Classification accuracy score for conditional generative models",
        "type": "article",
        "volume": "32",
        "year": "2019"
    },
    "ray2024cola": {
        "author": "Ray, Arijit and Radenovic, Filip and Dubey, Abhimanyu and Plummer, Bryan and Krishna, Ranjay and Saenko, Kate",
        "journal": "Advances in Neural Information Processing Systems",
        "title": "cola: A Benchmark for Compositional Text-to-image Retrieval",
        "type": "article",
        "volume": "36",
        "year": "2024"
    },
    "reed2016generative": {
        "author": "Reed, Scott and Akata, Zeynep and Yan, Xinchen and Logeswaran, Lajanugen and Schiele, Bernt and Lee, Honglak",
        "booktitle": "International conference on machine learning",
        "organization": "PMLR",
        "pages": "1060--1069",
        "title": "Generative adversarial text to image synthesis",
        "type": "inproceedings",
        "year": "2016"
    },
    "reed2016learning": {
        "author": "Reed, Scott E and Akata, Zeynep and Mohan, Santosh and Tenka, Samuel and Schiele, Bernt and Lee, Honglak",
        "journal": "Advances in neural information processing systems",
        "title": "Learning what and where to draw",
        "type": "article",
        "volume": "29",
        "year": "2016"
    },
    "reis2023real": {
        "author": "Reis, Dillon and Kupec, Jordan and Hong, Jacqueline and Daoudi, Ahmad",
        "journal": "arXiv preprint arXiv:2305.09972",
        "title": "Real-time flying object detection with YOLOv8",
        "type": "article",
        "year": "2023"
    },
    "roberts2022scaling": {
        "archiveprefix": "arXiv",
        "author": "Adam Roberts and Hyung Won Chung and Anselm Levskaya and Gaurav Mishra and James Bradbury and Daniel Andor and Sharan Narang and Brian Lester and Colin Gaffney and Afroz Mohiuddin and Curtis Hawthorne and Aitor Lewkowycz and Alex Salcianu and Marc van Zee and Jacob Austin and Sebastian Goodman and Livio Baldini Soares and Haitang Hu and Sasha Tsvyashchenko and Aakanksha Chowdhery and Jasmijn Bastings and Jannis Bulian and Xavier Garcia and Jianmo Ni and Andrew Chen and Kathleen Kenealy and Jonathan H. Clark and Stephan Lee and Dan Garrette and James Lee-Thorp and Colin Raffel and Noam Shazeer and Marvin Ritter and Maarten Bosma and Alexandre Passos and Jeremy Maitin-Shepard and Noah Fiedel and Mark Omernick and Brennan Saeta and Ryan Sepassi and Alexander Spiridonov and Joshua Newlan and Andrea Gesmundo",
        "eprint": "2203.17189",
        "primaryclass": "cs.LG",
        "title": "Scaling Up Models and Data with $\\texttt{t5x}$ and $\\texttt{seqio}$",
        "type": "misc",
        "year": "2022"
    },
    "rohrbach2016movie": {
        "archiveprefix": "arXiv",
        "author": "Anna Rohrbach and Atousa Torabi and Marcus Rohrbach and Niket Tandon and Christopher Pal and Hugo Larochelle and Aaron Courville and Bernt Schiele",
        "eprint": "1605.03705",
        "primaryclass": "cs.CV",
        "title": "Movie Description",
        "type": "misc",
        "year": "2016"
    },
    "saharia2022photorealistic": {
        "author": "Saharia, Chitwan and Chan, William and Saxena, Saurabh and Li, Lala and Whang, Jay and Denton, Emily L and Ghasemipour, Kamyar and Gontijo Lopes, Raphael and Karagol Ayan, Burcu and Salimans, Tim and others",
        "journal": "Advances in Neural Information Processing Systems",
        "pages": "36479--36494",
        "title": "Photorealistic text-to-image diffusion models with deep language understanding",
        "type": "article",
        "volume": "35",
        "year": "2022"
    },
    "sajjadi2018assessing": {
        "author": "Sajjadi, Mehdi SM and Bachem, Olivier and Lucic, Mario and Bousquet, Olivier and Gelly, Sylvain",
        "journal": "Advances in neural information processing systems",
        "title": "Assessing generative models via precision and recall",
        "type": "article",
        "volume": "31",
        "year": "2018"
    },
    "schuhmann2022laion": {
        "author": "Schuhmann, Christoph and Beaumont, Romain and Vencu, Richard and Gordon, Cade and Wightman, Ross and Cherti, Mehdi and Coombes, Theo and Katta, Aarush and Mullis, Clayton and Wortsman, Mitchell and others",
        "journal": "Advances in Neural Information Processing Systems",
        "pages": "25278--25294",
        "title": "Laion-5b: An open large-scale dataset for training next generation image-text models",
        "type": "article",
        "volume": "35",
        "year": "2022"
    },
    "sharma2018conceptual": {
        "author": "Sharma, Piyush and Ding, Nan and Goodman, Sebastian and Soricut, Radu",
        "booktitle": "Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
        "pages": "2556--2565",
        "title": "Conceptual captions: A cleaned, hypernymed, image alt-text dataset for automatic image captioning",
        "type": "inproceedings",
        "year": "2018"
    },
    "singh2023coarse": {
        "author": "Singh, Harman and Zhang, Pengchuan and Wang, Qifan and Wang, Mengjiao and Xiong, Wenhan and Du, Jingfei and Chen, Yu",
        "journal": "arXiv preprint arXiv:2305.13812",
        "title": "Coarse-to-Fine Contrastive Learning in Image-Text-Graph Space for Improved Vision-Language Compositionality",
        "type": "article",
        "year": "2023"
    },
    "singh2023divide": {
        "author": "Singh, Jaskirat and Zheng, Liang",
        "journal": "arXiv preprint arXiv:2307.04749",
        "title": "Divide, Evaluate, and Refine: Evaluating and Improving Text-to-Image Alignment with Iterative VQA Feedback",
        "type": "article",
        "year": "2023"
    },
    "suhr2017corpus": {
        "author": "Suhr, Alane and Lewis, Mike and Yeh, James and Artzi, Yoav",
        "booktitle": "Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)",
        "pages": "217--223",
        "title": "A corpus of natural language for visual reasoning",
        "type": "inproceedings",
        "year": "2017"
    },
    "szegedy2015going": {
        "author": "Szegedy, Christian and Liu, Wei and Jia, Yangqing and Sermanet, Pierre and Reed, Scott and Anguelov, Dragomir and Erhan, Dumitru and Vanhoucke, Vincent and Rabinovich, Andrew",
        "booktitle": "Proceedings of the IEEE conference on computer vision and pattern recognition",
        "pages": "1--9",
        "title": "Going deeper with convolutions",
        "type": "inproceedings",
        "year": "2015"
    },
    "szegedy2016inception": {
        "author": "Szegedy, Christian and Vanhoucke, Vincent and Ioffe, Sergey and Shlens, Jon and Wojna, Zbigniew",
        "booktitle": "Proceedings of the IEEE conference on computer vision and pattern recognition",
        "pages": "2818--2826",
        "title": "Rethinking the inception architecture for computer vision",
        "type": "inproceedings",
        "year": "2016"
    },
    "tan2019lxmert": {
        "author": "Tan, Hao and Bansal, Mohit",
        "journal": "arXiv preprint arXiv:1908.07490",
        "title": "Lxmert: Learning cross-modality encoder representations from transformers",
        "type": "article",
        "year": "2019"
    },
    "wang-etal-2023-diffusiondb": {
        "abstract": "\"With recent advancements in diffusion models, users can generate high-quality images by writing text prompts in natural language. However, generating images with desired details requires proper prompts, and it is often unclear how a model reacts to different prompts or what the best prompts are. To help researchers tackle these critical challenges, we introduce DiffusionDB, the first large-scale text-to-image prompt dataset totaling 6.5TB, containing 14 million images generated by Stable Diffusion, 1.8 million unique prompts, and hyperparameters specified by real users. We analyze the syntactic and semantic characteristics of prompts. We pinpoint specific hyperparameter values and prompt styles that can lead to model errors and present evidence of potentially harmful model usage, such as the generation of misinformation. The unprecedented scale and diversity of this human-actuated dataset provide exciting research opportunities in understanding the interplay between prompts and generative models, detecting deepfakes, and designing human-AI interaction tools to help users more easily use these models. DiffusionDB is publicly available at: \\url{https://poloclub.github.io/diffusiondb}.\",",
        "address": "\"Toronto, Canada\",",
        "author": "\"Wang, Zijie J.  and Montoya, Evan  and Munechika, David  and Yang, Haoyang  and Hoover, Benjamin  and Chau, Duen Horng\",",
        "booktitle": "\"Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)\",",
        "doi": "\"10.18653/v1/2023.acl-long.51\",",
        "editor": "\"Rogers, Anna  and Boyd-Graber, Jordan  and Okazaki, Naoaki\",",
        "month": "jul,",
        "pages": "\"893--911\",",
        "publisher": "\"Association for Computational Linguistics\",",
        "title": "\"{D}iffusion{DB}: A Large-scale Prompt Gallery Dataset for Text-to-Image Generative Models\",",
        "type": "inproceedings",
        "url": "\"https://aclanthology.org/2023.acl-long.51\",",
        "year": "\"2023\","
    },
    "wang-gaizauskas-2015-generating": {
        "address": "\"Brighton, UK\",",
        "author": "\"Wang, Josiah  and Gaizauskas, Robert\",",
        "booktitle": "\"Proceedings of the 15th {E}uropean Workshop on Natural Language Generation ({ENLG})\",",
        "doi": "\"10.18653/v1/W15-4722\",",
        "editor": "\"Belz, Anya  and Gatt, Albert  and Portet, Fran{\\c{c}}ois  and Purver, Matthew\",",
        "month": "sep,",
        "pages": "\"117--126\",",
        "publisher": "\"Association for Computational Linguistics\",",
        "title": "\"Generating Image Descriptions with Gold Standard Visual Inputs: Motivation, Evaluation and Baselines\",",
        "type": "inproceedings",
        "url": "\"https://aclanthology.org/W15-4722\",",
        "year": "\"2015\","
    },
    "wang2003multiscale": {
        "author": "Wang, Zhou and Simoncelli, Eero P and Bovik, Alan C",
        "booktitle": "The Thrity-Seventh Asilomar Conference on Signals, Systems \\& Computers, 2003",
        "organization": "Ieee",
        "pages": "1398--1402",
        "title": "Multiscale structural similarity for image quality assessment",
        "type": "inproceedings",
        "volume": "2",
        "year": "2003"
    },
    "wang2004image": {
        "author": "Wang, Zhou and Bovik, Alan C and Sheikh, Hamid R and Simoncelli, Eero P",
        "journal": "IEEE transactions on image processing",
        "number": "4",
        "pages": "600--612",
        "publisher": "IEEE",
        "title": "Image quality assessment: from error visibility to structural similarity",
        "type": "article",
        "volume": "13",
        "year": "2004"
    },
    "wang2023clipiqa": {
        "author": "Wang, Jianyi and Chan, Kelvin CK and Loy, Chen Change",
        "booktitle": "Proceedings of the AAAI Conference on Artificial Intelligence",
        "number": "2",
        "pages": "2555--2563",
        "title": "Exploring clip for assessing the look and feel of images",
        "type": "inproceedings",
        "volume": "37",
        "year": "2023"
    },
    "wolff2023the": {
        "author": "Max Wolff and Wieland Brendel and Stuart Wolff",
        "booktitle": "ICLR 2023 Workshop on Mathematical and Empirical Understanding of Foundation Models",
        "title": "The Independent Compositional Subspace Hypothesis for the Structure of {CLIP}'s Last Layer",
        "type": "inproceedings",
        "url": "https://openreview.net/forum?id",
        "year": "2023"
    },
    "wu2022grit": {
        "author": "Wu, Jialian and Wang, Jianfeng and Yang, Zhengyuan and Gan, Zhe and Liu, Zicheng and Yuan, Junsong and Wang, Lijuan",
        "journal": "arXiv preprint arXiv:2212.00280",
        "title": "Grit: A generative region-to-text transformer for object understanding",
        "type": "article",
        "year": "2022"
    },
    "xie2019visual": {
        "archiveprefix": "arXiv",
        "author": "Ning Xie and Farley Lai and Derek Doran and Asim Kadav",
        "eprint": "1811.10582",
        "primaryclass": "cs.CV",
        "title": "Visual Entailment Task for Visually-Grounded Language Learning",
        "type": "misc",
        "year": "2019"
    },
    "xu-etal-2023-metarevision": {
        "address": "\"Singapore\",",
        "author": "\"Xu, Guangyue  and Kordjamshidi, Parisa  and Chai, Joyce\",",
        "booktitle": "\"Findings of the Association for Computational Linguistics: EMNLP 2023\",",
        "doi": "\"10.18653/v1/2023.findings-emnlp.818\",",
        "editor": "\"Bouamor, Houda  and Pino, Juan  and Bali, Kalika\",",
        "month": "dec,",
        "pages": "\"12224--12236\"",
        "publisher": "\"Association for Computational Linguistics\",",
        "title": "\"{M}eta{R}e{V}ision: Meta-Learning with Retrieval for Visually Grounded Compositional Concept Acquisition\",",
        "type": "inproceedings",
        "url": "\"https://aclanthology.org/2023.findings-emnlp.818\",",
        "year": "\"2023\","
    },
    "xu2023imagereward": {
        "author": "Xu, Jiazheng and Liu, Xiao and Wu, Yuchen and Tong, Yuxuan and Li, Qinkai and Ding, Ming and Tang, Jie and Dong, Yuxiao",
        "journal": "arXiv preprint arXiv:2304.05977",
        "title": "Imagereward: Learning and evaluating human preferences for text-to-image generation",
        "type": "article",
        "year": "2023"
    },
    "yang2023diffusion": {
        "author": "Yang, Ling and Zhang, Zhilong and Song, Yang and Hong, Shenda and Xu, Runsheng and Zhao, Yue and Zhang, Wentao and Cui, Bin and Yang, Ming-Hsuan",
        "journal": "ACM Computing Surveys",
        "number": "4",
        "pages": "1--39",
        "publisher": "ACM New York, NY, USA",
        "title": "Diffusion models: A comprehensive survey of methods and applications",
        "type": "article",
        "volume": "56",
        "year": "2023"
    },
    "yarom2023seetrue": {
        "archiveprefix": "arXiv",
        "author": "Michal Yarom and Yonatan Bitton and Soravit Changpinyo and Roee Aharoni and Jonathan Herzig and Oran Lang and Eran Ofek and Idan Szpektor",
        "eprint": "2305.10400",
        "primaryclass": "cs.CL",
        "title": "What You See is What You Read? Improving Text-Image Alignment Evaluation",
        "type": "misc",
        "year": "2023"
    },
    "young2014flickr30k": {
        "abstract": "\"We propose to use the visual denotations of linguistic expressions (i.e. the set of images they describe) to define novel denotational similarity metrics, which we show to be at least as beneficial as distributional similarities for two tasks that require semantic inference. To compute these denotational similarities, we construct a denotation graph, i.e. a subsumption hierarchy over constituents and their denotations, based on a large corpus of 30K images and 150K descriptive captions.\",",
        "address": "\"Cambridge, MA\",",
        "author": "\"Young, Peter  and Lai, Alice  and Hodosh, Micah  and Hockenmaier, Julia\",",
        "doi": "\"10.1162/tacl_a_00166\",",
        "editor": "\"Lin, Dekang  and Collins, Michael  and Lee, Lillian\",",
        "journal": "\"Transactions of the Association for Computational Linguistics\",",
        "pages": "\"67--78\",",
        "publisher": "\"MIT Press\",",
        "title": "\"From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions\",",
        "type": "article",
        "url": "\"https://aclanthology.org/Q14-1006\",",
        "volume": "\"2\",",
        "year": "\"2014\","
    },
    "young2014image": {
        "author": "Young, Peter and Lai, Alice and Hodosh, Micah and Hockenmaier, Julia",
        "journal": "Transactions of the Association for Computational Linguistics",
        "pages": "67--78",
        "publisher": "MIT Press One Rogers Street, Cambridge, MA 02142-1209, USA journals-info~\u2026",
        "title": "From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions",
        "type": "article",
        "volume": "2",
        "year": "2014"
    },
    "yuksekgonul2022and": {
        "author": "Yuksekgonul, Mert and Bianchi, Federico and Kalluri, Pratyusha and Jurafsky, Dan and Zou, James",
        "booktitle": "The Eleventh International Conference on Learning Representations",
        "title": "When and Why Vision-Language Models Behave like Bags-Of-Words, and What to Do About It?",
        "type": "inproceedings",
        "year": "2022"
    },
    "zelaszczyk2024text": {
        "author": "{\\.Z}elaszczyk, Maciej and Ma{\\'n}dziuk, Jacek",
        "journal": "arXiv preprint arXiv:2401.11631",
        "title": "Text-to-Image Cross-Modal Generation: A Systematic Review",
        "type": "article",
        "year": "2024"
    },
    "zhang2011fsim": {
        "author": "Zhang, Lin and Zhang, Lei and Mou, Xuanqin and Zhang, David",
        "journal": "IEEE transactions on Image Processing",
        "number": "8",
        "pages": "2378--2386",
        "publisher": "IEEE",
        "title": "FSIM: A feature similarity index for image quality assessment",
        "type": "article",
        "volume": "20",
        "year": "2011"
    },
    "zhang2018unreasonable": {
        "author": "Zhang, Richard and Isola, Phillip and Efros, Alexei A and Shechtman, Eli and Wang, Oliver",
        "booktitle": "Proceedings of the IEEE conference on computer vision and pattern recognition",
        "pages": "586--595",
        "title": "The unreasonable effectiveness of deep features as a perceptual metric",
        "type": "inproceedings",
        "year": "2018"
    },
    "zhang2022perceptual": {
        "author": "Zhang, Lingzhi and Zhou, Yuqian and Barnes, Connelly and Amirghodsi, Sohrab and Lin, Zhe and Shechtman, Eli and Shi, Jianbo",
        "booktitle": "European Conference on Computer Vision",
        "organization": "Springer",
        "pages": "146--164",
        "title": "Perceptual artifacts localization for inpainting",
        "type": "inproceedings",
        "year": "2022"
    },
    "zhang2023perceptual": {
        "author": "Zhang, Lingzhi and Xu, Zhengjie and Barnes, Connelly and Zhou, Yuqian and Liu, Qing and Zhang, He and Amirghodsi, Sohrab and Lin, Zhe and Shechtman, Eli and Shi, Jianbo",
        "booktitle": "Proceedings of the IEEE/CVF International Conference on Computer Vision",
        "pages": "7579--7590",
        "title": "Perceptual Artifacts Localization for Image Synthesis Tasks",
        "type": "inproceedings",
        "year": "2023"
    },
    "zhang2023spot": {
        "author": "Zhang, Gengyuan and Bi, Jinhe and Gu, Jindong and Tresp, Volker",
        "journal": "arXiv preprint arXiv:2311.12919",
        "title": "SPOT! Revisiting Video-Language Models for Event Understanding",
        "type": "article",
        "year": "2023"
    },
    "zhang2023texttoimage": {
        "archiveprefix": "arXiv",
        "author": "Chenshuang Zhang and Chaoning Zhang and Mengchun Zhang and In So Kweon",
        "eprint": "2303.07909",
        "primaryclass": "cs.CV",
        "title": "Text-to-image Diffusion Models in Generative AI: A Survey",
        "type": "misc",
        "year": "2023"
    },
    "zhao-etal-2022-explainable": {
        "address": "\"Abu Dhabi, UAE\",",
        "author": "\"Zhao, Tiancheng  and Zhang, Tianqi  and Zhu, Mingwei  and Shen, Haozhan  and Lee, Kyusong  and Lu, Xiaopeng  and Yin, Jianwei\",",
        "booktitle": "\"Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing: System Demonstrations\",",
        "doi": "\"10.18653/v1/2022.emnlp-demos.4\",",
        "editor": "\"Che, Wanxiang  and Shutova, Ekaterina\",",
        "month": "dec,",
        "pages": "\"30--37\"",
        "publisher": "\"Association for Computational Linguistics\",",
        "title": "\"An Explainable Toolbox for Evaluating Pre-trained Vision-Language Models\",",
        "type": "inproceedings",
        "url": "\"https://aclanthology.org/2022.emnlp-demos.4\",",
        "year": "\"2022\","
    },
    "zhu2023contrastive": {
        "author": "Zhu, Xiangru and Sun, Penglei and Wang, Chengyu and Liu, Jingping and Li, Zhixu and Xiao, Yanghua and Huang, Jun",
        "journal": "arXiv preprint arXiv:2312.02338",
        "title": "A Contrastive Compositional Benchmark for Text-to-Image Synthesis: A Study with Unified Text-to-Image Fidelity Metrics",
        "type": "article",
        "year": "2023"
    }
};