ngocson2002 committed on
Commit
2ab3e8a
·
1 Parent(s): ca39bed

Update model

Browse files
Files changed (3) hide show
  1. config.json +356 -0
  2. model.safetensors +1 -1
  3. modeling_vivqa.py +13 -10
config.json CHANGED
@@ -21,8 +21,364 @@
21
  "encoder_layers": 4,
22
  "encoder_normalize_before": true,
23
  "fsdp": false,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  "img_size": 224,
25
  "in_chans": 3,
 
26
  "layernorm_embedding": false,
27
  "layernorm_eps": 1e-05,
28
  "max_rel_pos": 0,
 
21
  "encoder_layers": 4,
22
  "encoder_normalize_before": true,
23
  "fsdp": false,
24
+ "id2label": {
25
+ "0": "hai",
26
+ "1": "ba",
27
+ "2": "b\u1ed1n",
28
+ "3": "m\u00e0u tr\u1eafng",
29
+ "4": "m\u00e0u \u0111\u1ecf",
30
+ "5": "m\u00e0u xanh d\u01b0\u01a1ng",
31
+ "6": "m\u00e0u \u0111en",
32
+ "7": "m\u00e0u xanh l\u00e1",
33
+ "8": "ph\u00f2ng",
34
+ "9": "m\u00e0u v\u00e0ng",
35
+ "10": "ph\u00f2ng b\u1ebfp",
36
+ "11": "m\u00e0u n\u00e2u",
37
+ "12": "ph\u00f2ng t\u1eafm",
38
+ "13": "m\u00e0u cam",
39
+ "14": "gi\u01b0\u1eddng",
40
+ "15": "con m\u00e8o",
41
+ "16": "h\u01b0\u01a1u cao c\u1ed5",
42
+ "17": "m\u00e1y bay",
43
+ "18": "g\u01b0\u01a1ng",
44
+ "19": "n\u0103m",
45
+ "20": "con chim",
46
+ "21": "m\u00e0u x\u00e1m",
47
+ "22": "m\u00e0u t\u00eda",
48
+ "23": "con ch\u00f3",
49
+ "24": "con thuy\u1ec1n",
50
+ "25": "g\u1ea5u",
51
+ "26": "xe \u00f4 t\u00f4",
52
+ "27": "l\u1ecd c\u1eafm hoa",
53
+ "28": "con voi",
54
+ "29": "m\u1ed9t",
55
+ "30": "con ng\u1ef1a",
56
+ "31": "c\u00e1i gh\u1ebf",
57
+ "32": "xe m\u00e1y",
58
+ "33": "xe t\u1ea3i",
59
+ "34": "t\u00e0u h\u1ecfa",
60
+ "35": "xe bu\u00fdt",
61
+ "36": "\u0111\u01b0\u1eddng ph\u1ed1",
62
+ "37": "ch\u1eadu",
63
+ "38": "h\u1ed9p",
64
+ "39": "b\u00e1t",
65
+ "40": "pizza",
66
+ "41": "xe \u0111\u1ea1p",
67
+ "42": "chu\u1ed3ng",
68
+ "43": "con b\u00f2",
69
+ "44": "vali",
70
+ "45": "b\u00e1nh",
71
+ "46": "\u0111\u1ed3ng h\u1ed3",
72
+ "47": "s\u00e1u",
73
+ "48": "di\u1ec1u",
74
+ "49": "b\u0103ng gh\u1ebf",
75
+ "50": "donut",
76
+ "51": "nh\u00e0 v\u1ec7 sinh",
77
+ "52": "l\u00f2 vi s\u00f3ng",
78
+ "53": "sandwich",
79
+ "54": "ng\u1ef1a v\u1eb1n",
80
+ "55": "tr\u1ea1m",
81
+ "56": "chi\u1ebfc \u00f4",
82
+ "57": "ph\u00f2ng ng\u1ee7",
83
+ "58": "ng\u1ef1a r\u1eb1n",
84
+ "59": "\u0111\u0129a \u0103n",
85
+ "60": "v\u00f2i",
86
+ "61": "\u0111i\u1ec7n tho\u1ea1i",
87
+ "62": "con c\u1eebu",
88
+ "63": "t\u00f2a nh\u00e0",
89
+ "64": "v\u00e1n tr\u01b0\u1ee3t",
90
+ "65": "c\u1eeda s\u1ed5",
91
+ "66": "c\u1eeda h\u00e0ng",
92
+ "67": "t\u00f2a th\u00e1p",
93
+ "68": "b\u1ed3n t\u1eafm",
94
+ "69": "c\u00e1i r\u1ed5",
95
+ "70": "c\u00e2y",
96
+ "71": "m\u00e1y vi t\u00ednh",
97
+ "72": "qu\u00e1n \u0103n",
98
+ "73": "ga ra",
99
+ "74": "ch\u1ea3o",
100
+ "75": "v\u01b0\u1eddn b\u00e1ch th\u00fa",
101
+ "76": "nh\u00e0 \u1edf",
102
+ "77": "xe \u0111\u1ea9y",
103
+ "78": "laptop",
104
+ "79": "xe l\u1eeda",
105
+ "80": "b\u00f4ng hoa",
106
+ "81": "v\u00e1n l\u01b0\u1edbt s\u00f3ng",
107
+ "82": "c\u00e1i t\u00fai",
108
+ "83": "t\u1ee7 \u0111\u00e1",
109
+ "84": "qu\u1ea3 b\u00f3ng",
110
+ "85": "chu\u1ed1i",
111
+ "86": "s\u00e2n bay",
112
+ "87": "v\u0103n ph\u00f2ng",
113
+ "88": "th\u00f9ng ch\u1ee9a",
114
+ "89": "n\u00fai",
115
+ "90": "c\u00e1i b\u00e0n",
116
+ "91": "tr\u01b0\u1ee3t tuy\u1ebft",
117
+ "92": "c\u00e0 v\u1ea1t",
118
+ "93": "h\u1ed3 b\u01a1i",
119
+ "94": "b\u00e3i c\u1ecf",
120
+ "95": "b\u1ea3y",
121
+ "96": "m\u00f3n \u0103n",
122
+ "97": "\u0111\u01b0\u1eddng b\u1ed9",
123
+ "98": "xe",
124
+ "99": "n\u00f3n",
125
+ "100": "\u0111\u1ed9ng c\u01a1",
126
+ "101": "c\u00e1i m\u00e2m",
127
+ "102": "g\u1eady",
128
+ "103": "g\u1ea5u tr\u00fac",
129
+ "104": "c\u1eeda ti\u1ec7m",
130
+ "105": "con v\u1ecbt",
131
+ "106": "l\u1ed3ng",
132
+ "107": "t\u01b0\u1eddng",
133
+ "108": "c\u00e1i n\u1ed3i",
134
+ "109": "t\u1ee7 l\u1ea1nh",
135
+ "110": "c\u1eeda",
136
+ "111": "t\u00e1ch",
137
+ "112": "b\u1ee9c \u1ea3nh",
138
+ "113": "s\u00e2n v\u01b0\u1eddn",
139
+ "114": "\u0111\u1ed3i",
140
+ "115": "b\u1eefa \u0103n",
141
+ "116": "s\u00e2n v\u1eadn \u0111\u1ed9ng",
142
+ "117": "d\u0129a nh\u1ef1a",
143
+ "118": "ph\u01b0\u01a1ng ti\u1ec7n giao th\u00f4ng",
144
+ "119": "m\u00e1y xay",
145
+ "120": "\u0111\u1ed3 ch\u01a1i",
146
+ "121": "m\u0169",
147
+ "122": "rau",
148
+ "123": "\u00e1o vest",
149
+ "124": "v\u00f2i hoa sen",
150
+ "125": "b\u00e0n ch\u1ea3i",
151
+ "126": "c\u00e1i k\u1ec7",
152
+ "127": "\u0111\u01b0\u1eddng",
153
+ "128": "xe l\u0103n",
154
+ "129": "c\u00e0 r\u1ed1t",
155
+ "130": "xe c\u1ed9",
156
+ "131": "th\u00e2n c\u00e2y",
157
+ "132": "m\u00e1y \u1ea3nh",
158
+ "133": "chai",
159
+ "134": "\u00f4 c\u1eeda",
160
+ "135": "s\u00e2n",
161
+ "136": "b\u1ebfn du thuy\u1ec1n",
162
+ "137": "dao",
163
+ "138": "xe tay ga",
164
+ "139": "qu\u00e1n bar",
165
+ "140": "th\u01b0 vi\u1ec7n",
166
+ "141": "h\u00e0nh l\u00fd",
167
+ "142": "b\u1edd bi\u1ec3n",
168
+ "143": "t\u00e1m",
169
+ "144": "c\u00e1i l\u1ecd",
170
+ "145": "m\u1eb7t tr\u1eddi",
171
+ "146": "\u00e1o s\u01a1 mi",
172
+ "147": "qu\u1ea7y t\u00ednh ti\u1ec1n",
173
+ "148": "\u0111\u01b0\u1eddng s\u1eaft",
174
+ "149": "b\u1ea7u tr\u1eddi",
175
+ "150": "chu\u1ed9t",
176
+ "151": "r\u00e0o ch\u1eafn",
177
+ "152": "\u1ea3nh ch\u1ee5p",
178
+ "153": "balo",
179
+ "154": "b\u1ea3o t\u00e0ng",
180
+ "155": "qu\u1ea3 t\u00e1o",
181
+ "156": "hoa qu\u1ea3",
182
+ "157": "b\u1ee9c t\u01b0\u1ee3ng",
183
+ "158": "m\u00e1y t\u00ednh",
184
+ "159": "c\u00e1c t\u00f2a nh\u00e0",
185
+ "160": "ch\u00e9n \u0111\u0129a",
186
+ "161": "m\u01b0\u1eddi",
187
+ "162": "ch\u00edn",
188
+ "163": "gi\u1ea5y b\u1ea1c",
189
+ "164": "s\u00e0n nh\u00e0",
190
+ "165": "chu\u1ed3ng tr\u1ea1i",
191
+ "166": "l\u1edbp h\u1ecdc",
192
+ "167": "kho",
193
+ "168": "b\u1ebfp",
194
+ "169": "b\u1ea3ng",
195
+ "170": "gia s\u00fac",
196
+ "171": "th\u1ecbt",
197
+ "172": "b\u1ed3n ti\u1ec3u",
198
+ "173": "t\u1ea1p d\u1ec1",
199
+ "174": "c\u00e1i l\u1ec1u",
200
+ "175": "g\u0103ng tay",
201
+ "176": "h\u00e0nh lang",
202
+ "177": "l\u00e1",
203
+ "178": "t\u00fai",
204
+ "179": "h\u1ea3i \u00e2u",
205
+ "180": "v\u1ee3t",
206
+ "181": "b\u00e0n ph\u00edm",
207
+ "182": "s\u00f4 c\u00f4 la",
208
+ "183": "r\u01b0\u1ee3u",
209
+ "184": "t\u00e1o",
210
+ "185": "gian h\u00e0ng",
211
+ "186": "xe \u0111i\u1ec7n ng\u1ea7m",
212
+ "187": "m\u00e1y s\u1ea5y kh\u00f4",
213
+ "188": "toa xe",
214
+ "189": "trang thi\u1ebft b\u1ecb",
215
+ "190": "c\u1ed7 m\u00e1y",
216
+ "191": "n\u01b0\u1edbc",
217
+ "192": "c\u00e2y k\u00e9o",
218
+ "193": "ng\u0103n k\u00e9o",
219
+ "194": "v\u1ea1ch k\u1ebb \u0111\u01b0\u1eddng",
220
+ "195": "b\u00e1nh ng\u1ecdt",
221
+ "196": "l\u1ed1i \u0111i",
222
+ "197": "t\u00e0u",
223
+ "198": "\u0111\u01b0\u1eddng \u0111i b\u1ed9",
224
+ "199": "d\u0129a",
225
+ "200": "con v\u1eb9t",
226
+ "201": "l\u00e1 c\u1edd",
227
+ "202": "kh\u0103n",
228
+ "203": "chung c\u01b0",
229
+ "204": "h\u1ed3",
230
+ "205": "ca n\u00f4",
231
+ "206": "gi\u00e1 \u0111\u1ee1",
232
+ "207": "nh\u1eefng qu\u1ea3 cam",
233
+ "208": "b\u1eefa tr\u01b0a",
234
+ "209": "k\u00ednh \u0111eo",
235
+ "210": "cupcake",
236
+ "211": "\u0111\u01b0\u1eddng ray",
237
+ "212": "b\u1ed9 \u0111\u1ed3",
238
+ "213": "h\u00e0ng ho\u00e1",
239
+ "214": "nh\u1eefng b\u1ee9c \u1ea3nh",
240
+ "215": "c\u00e1i v\u00ed",
241
+ "216": "c\u1eebu",
242
+ "217": "ng\u01b0\u1eddi gi\u1eef",
243
+ "218": "b\u1ee9c tranh",
244
+ "219": "c\u1ea7u",
245
+ "220": "nhi\u1ec1u c\u00e1i gh\u1ebf",
246
+ "221": "b\u00f4ng c\u1ea3i xanh",
247
+ "222": "b\u1eefa \u0103n t\u1ed1i",
248
+ "223": "v\u1ebd tranh l\u00ean t\u01b0\u1eddng",
249
+ "224": "thuy\u1ec1n bu\u1ed3m",
250
+ "225": "\u0111i v\u0103ng",
251
+ "226": "s\u00e2n kh\u1ea5u",
252
+ "227": "n\u1ebfn",
253
+ "228": "bu\u1ed3ng",
254
+ "229": "c\u00e1i th\u00eca",
255
+ "230": "c\u1ecf kh\u00f4",
256
+ "231": "con kh\u1ec9",
257
+ "232": "t\u01b0\u1ee3ng \u0111\u00e0i",
258
+ "233": "t\u1ee7 \u0111\u00f4ng",
259
+ "234": "hoa h\u1ed3ng",
260
+ "235": "chim b\u1ed3 c\u00e2u",
261
+ "236": "hay",
262
+ "237": "g\u1ea7u m\u00fac",
263
+ "238": "b\u00fai t\u00f3c",
264
+ "239": "m\u00f3ng vu\u1ed1t",
265
+ "240": "xe \u0111i\u1ec7n",
266
+ "241": "\u0111\u0129a",
267
+ "242": "m\u00e0n",
268
+ "243": "\u00e1o kho\u00e1c",
269
+ "244": "m\u1eb7t n\u1ea1",
270
+ "245": "\u0111\u1ed3 u\u1ed1ng",
271
+ "246": "b\u00f2 \u0111\u1ef1c",
272
+ "247": "c\u00e1i n\u0129a",
273
+ "248": "\u0111\u01b0\u1eddng \u1ed1ng",
274
+ "249": "n\u01b0\u1edbc ti\u1ec3u",
275
+ "250": "ly",
276
+ "251": "\u0111\u00e8n \u0111\u1ec3 b\u00e0n",
277
+ "252": "\u0111\u1ed3 n\u1ed9i th\u1ea5t",
278
+ "253": "m\u00e1i ch\u00e8o",
279
+ "254": "\u0111\u1ea7u m\u00e1y",
280
+ "255": "\u0111\u1ea7m",
281
+ "256": "m\u0169 l\u01b0\u1ee1i trai",
282
+ "257": "truy\u1ec1n h\u00ecnh",
283
+ "258": "ph\u00f4 mai",
284
+ "259": "c\u00e0 ph\u00ea",
285
+ "260": "b\u1ebfn t\u00e0u",
286
+ "261": "con d\u00ea",
287
+ "262": "c\u1eeda ra v\u00e0o",
288
+ "263": "k\u00fd t\u00ean",
289
+ "264": "thi\u1ebft b\u1ecb",
290
+ "265": "b\u00ecnh hoa",
291
+ "266": "bia",
292
+ "267": "con d\u1ed1c",
293
+ "268": "\u00e1o cho\u00e0ng",
294
+ "269": "m\u00f3n tr\u00e1ng mi\u1ec7ng",
295
+ "270": "c\u00e2y s\u00e0o",
296
+ "271": "thu\u1ed1c l\u00e1",
297
+ "272": "m\u1eb7t",
298
+ "273": "k\u00ednh r\u00e2m",
299
+ "274": "\u0111i\u00eau kh\u1eafc",
300
+ "275": "nh\u00e0",
301
+ "276": "rau qu\u1ea3",
302
+ "277": "tr\u00e1i c\u00e2y",
303
+ "278": "qu\u1ea3 cam",
304
+ "279": "\u0111\u0129a n\u00e9m",
305
+ "280": "ba lan",
306
+ "281": "c\u00e2y g\u1eady",
307
+ "282": "s\u1eefa",
308
+ "283": "h\u1ed9p \u0111\u1ef1ng",
309
+ "284": "khung",
310
+ "285": "ngo\u00e0i tr\u1eddi",
311
+ "286": "\u0111o\u1ea1n phim gi\u1edbi thi\u1ec7u",
312
+ "287": "c\u1edd",
313
+ "288": "th\u00f9ng",
314
+ "289": "l\u00f2 s\u01b0\u1edfi",
315
+ "290": "l\u00e1t c\u1eaft",
316
+ "291": "b\u1eafp ch\u00e2n",
317
+ "292": "c\u00fan y\u00eau",
318
+ "293": "ng\u00e2n h\u00e0ng",
319
+ "294": "rau x\u00e0 l\u00e1ch",
320
+ "295": "xa l\u1ed9",
321
+ "296": "g\u00e0",
322
+ "297": "qu\u1ea7n short",
323
+ "298": "v\u00f2i n\u01b0\u1edbc",
324
+ "299": "m\u0169 b\u1ea3o hi\u1ec3m",
325
+ "300": "c\u00f4ng c\u1ee5",
326
+ "301": "qu\u1ea3 cam ",
327
+ "302": "v\u00e1n tr\u01b0\u1ee3t tuy\u1ebft",
328
+ "303": "g\u1ea1ch",
329
+ "304": "ch\u00ecm xu\u1ed1ng",
330
+ "305": "kh\u0103n t\u1eafm",
331
+ "306": "l\u00e1t g\u1ea1ch",
332
+ "307": "ng\u0103n",
333
+ "308": "b\u1ea3ng hi\u1ec7u",
334
+ "309": "l\u0103n tr\u00f2n",
335
+ "310": "hotdog",
336
+ "311": "c\u1ecf",
337
+ "312": "b\u00ecnh",
338
+ "313": "b\u00ean",
339
+ "314": "t\u00e0u ho\u1ea3",
340
+ "315": "b\u00e1nh xe",
341
+ "316": "lon",
342
+ "317": "nh\u00e0 t\u1eafm",
343
+ "318": "\u0111\u01b0\u1eddng \u0111ua",
344
+ "319": "m\u00e0u s\u1eafc",
345
+ "320": "bao b\u00ec",
346
+ "321": "th\u00e0nh ph\u1ea7n",
347
+ "322": "chim \u01b0ng",
348
+ "323": "\u0111i\u1ec3m t\u00e2m",
349
+ "324": "d\u0129a ",
350
+ "325": "b\u00e0n ch\u1ea3i \u0111\u00e1nh r\u0103ng",
351
+ "326": "h\u00e0ng h\u00f3a",
352
+ "327": "pug",
353
+ "328": "h\u1ed9p s\u1ed1",
354
+ "329": "c\u00e1",
355
+ "330": "gi\u1ecf",
356
+ "331": "gh\u1ebf s\u00f4 pha",
357
+ "332": "qu\u1ea7n \u00e1o",
358
+ "333": "tr\u01b0\u1eddng h\u1ee3p",
359
+ "334": "b\u00f2",
360
+ "335": "v\u00f4 tuy\u1ebfn",
361
+ "336": "con thoi",
362
+ "337": "theo d\u00f5i",
363
+ "338": "\u00e1o ba l\u1ed7",
364
+ "339": "d\u00f2ng s\u00f4ng",
365
+ "340": "g\u00e0 t\u00e2y",
366
+ "341": "d\u1ea5u hi\u1ec7u",
367
+ "342": "m\u00e8o con",
368
+ "343": "m\u1eaft",
369
+ "344": "\u0111\u01b0a \u0111\u00f3n",
370
+ "345": "con heo",
371
+ "346": "ngo\u00e0i",
372
+ "347": "\u0111\u1ed3ng ph\u1ee5c",
373
+ "348": "m\u00e1y bay tr\u1ef1c th\u0103ng",
374
+ "349": "\u0111\u1ea1i d\u01b0\u01a1ng",
375
+ "350": "b\u1ee9c m\u00e0n",
376
+ "351": "cam",
377
+ "352": "b\u00e1nh hamburger"
378
+ },
379
  "img_size": 224,
380
  "in_chans": 3,
381
+ "label2id": null,
382
  "layernorm_embedding": false,
383
  "layernorm_eps": 1e-05,
384
  "max_rel_pos": 0,
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:06d19ca8565c6ca7b5717df05fd5490768bf2d73e27f4b662fbd9ae120ca71e1
3
  size 4911305908
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:350969d1b1809558a103e887928ed65b68950e1d33edaff47e549c923b5b7691
3
  size 4911305908
modeling_vivqa.py CHANGED
@@ -38,10 +38,12 @@ class Blip2EfficientExtractor(nn.Module):
38
 
39
  # Efficientnet
40
  self.model_efficient = EfficientNet.from_pretrained('efficientnet-b7').to(self.device)
 
41
  self.pooling1 = nn.AdaptiveAvgPool2d((1, 32))
42
  self.pooling2 = nn.AdaptiveAvgPool2d((1, 768))
43
 
44
- def forward(self, images):
 
45
  global_features = self.model_blip2.extract_features(samples={"image": images}, mode="image").image_embeds
46
 
47
  local_features = self.model_efficient.extract_features(images)
@@ -111,18 +113,19 @@ class ViVQABEiT3(PreTrainedModel):
111
  x1 = self.vision_embed(visual_tokens)
112
  multiway_split_position = x1.size(1)
113
 
114
- x2 = self.text_embed(textual_tokens, text_padding_position)
115
  x2 = self.linear(x2)
116
 
117
  x = torch.cat([x1, x2], dim=1)
118
- if text_padding_position is not None:
119
- encoder_padding_mask = torch.cat(
120
- [
121
- torch.zeros(x1.shape[:-1]).to(x1.device).bool(),
122
- text_padding_position,
123
- ],
124
- dim=1,
125
- )
 
126
  encoder_out = self.encoder(
127
  src_tokens=None,
128
  encoder_padding_mask=encoder_padding_mask,
 
38
 
39
  # Efficientnet
40
  self.model_efficient = EfficientNet.from_pretrained('efficientnet-b7').to(self.device)
41
+ self.model_efficient.eval()
42
  self.pooling1 = nn.AdaptiveAvgPool2d((1, 32))
43
  self.pooling2 = nn.AdaptiveAvgPool2d((1, 768))
44
 
45
+ def forward(self, images):
46
+
47
  global_features = self.model_blip2.extract_features(samples={"image": images}, mode="image").image_embeds
48
 
49
  local_features = self.model_efficient.extract_features(images)
 
113
  x1 = self.vision_embed(visual_tokens)
114
  multiway_split_position = x1.size(1)
115
 
116
+ x2 = self.text_embed(textual_tokens, 1-text_padding_position)
117
  x2 = self.linear(x2)
118
 
119
  x = torch.cat([x1, x2], dim=1)
120
+
121
+ encoder_padding_mask = torch.cat(
122
+ [
123
+ torch.zeros(x1.shape[:-1]).to(x1.device).bool(),
124
+ text_padding_position,
125
+ ],
126
+ dim=1,
127
+ )
128
+
129
  encoder_out = self.encoder(
130
  src_tokens=None,
131
  encoder_padding_mask=encoder_padding_mask,