Commit
·
2ab3e8a
1
Parent(s):
ca39bed
Update model
Browse files- config.json +356 -0
- model.safetensors +1 -1
- modeling_vivqa.py +13 -10
config.json
CHANGED
@@ -21,8 +21,364 @@
|
|
21 |
"encoder_layers": 4,
|
22 |
"encoder_normalize_before": true,
|
23 |
"fsdp": false,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
"img_size": 224,
|
25 |
"in_chans": 3,
|
|
|
26 |
"layernorm_embedding": false,
|
27 |
"layernorm_eps": 1e-05,
|
28 |
"max_rel_pos": 0,
|
|
|
21 |
"encoder_layers": 4,
|
22 |
"encoder_normalize_before": true,
|
23 |
"fsdp": false,
|
24 |
+
"id2label": {
|
25 |
+
"0": "hai",
|
26 |
+
"1": "ba",
|
27 |
+
"2": "b\u1ed1n",
|
28 |
+
"3": "m\u00e0u tr\u1eafng",
|
29 |
+
"4": "m\u00e0u \u0111\u1ecf",
|
30 |
+
"5": "m\u00e0u xanh d\u01b0\u01a1ng",
|
31 |
+
"6": "m\u00e0u \u0111en",
|
32 |
+
"7": "m\u00e0u xanh l\u00e1",
|
33 |
+
"8": "ph\u00f2ng",
|
34 |
+
"9": "m\u00e0u v\u00e0ng",
|
35 |
+
"10": "ph\u00f2ng b\u1ebfp",
|
36 |
+
"11": "m\u00e0u n\u00e2u",
|
37 |
+
"12": "ph\u00f2ng t\u1eafm",
|
38 |
+
"13": "m\u00e0u cam",
|
39 |
+
"14": "gi\u01b0\u1eddng",
|
40 |
+
"15": "con m\u00e8o",
|
41 |
+
"16": "h\u01b0\u01a1u cao c\u1ed5",
|
42 |
+
"17": "m\u00e1y bay",
|
43 |
+
"18": "g\u01b0\u01a1ng",
|
44 |
+
"19": "n\u0103m",
|
45 |
+
"20": "con chim",
|
46 |
+
"21": "m\u00e0u x\u00e1m",
|
47 |
+
"22": "m\u00e0u t\u00eda",
|
48 |
+
"23": "con ch\u00f3",
|
49 |
+
"24": "con thuy\u1ec1n",
|
50 |
+
"25": "g\u1ea5u",
|
51 |
+
"26": "xe \u00f4 t\u00f4",
|
52 |
+
"27": "l\u1ecd c\u1eafm hoa",
|
53 |
+
"28": "con voi",
|
54 |
+
"29": "m\u1ed9t",
|
55 |
+
"30": "con ng\u1ef1a",
|
56 |
+
"31": "c\u00e1i gh\u1ebf",
|
57 |
+
"32": "xe m\u00e1y",
|
58 |
+
"33": "xe t\u1ea3i",
|
59 |
+
"34": "t\u00e0u h\u1ecfa",
|
60 |
+
"35": "xe bu\u00fdt",
|
61 |
+
"36": "\u0111\u01b0\u1eddng ph\u1ed1",
|
62 |
+
"37": "ch\u1eadu",
|
63 |
+
"38": "h\u1ed9p",
|
64 |
+
"39": "b\u00e1t",
|
65 |
+
"40": "pizza",
|
66 |
+
"41": "xe \u0111\u1ea1p",
|
67 |
+
"42": "chu\u1ed3ng",
|
68 |
+
"43": "con b\u00f2",
|
69 |
+
"44": "vali",
|
70 |
+
"45": "b\u00e1nh",
|
71 |
+
"46": "\u0111\u1ed3ng h\u1ed3",
|
72 |
+
"47": "s\u00e1u",
|
73 |
+
"48": "di\u1ec1u",
|
74 |
+
"49": "b\u0103ng gh\u1ebf",
|
75 |
+
"50": "donut",
|
76 |
+
"51": "nh\u00e0 v\u1ec7 sinh",
|
77 |
+
"52": "l\u00f2 vi s\u00f3ng",
|
78 |
+
"53": "sandwich",
|
79 |
+
"54": "ng\u1ef1a v\u1eb1n",
|
80 |
+
"55": "tr\u1ea1m",
|
81 |
+
"56": "chi\u1ebfc \u00f4",
|
82 |
+
"57": "ph\u00f2ng ng\u1ee7",
|
83 |
+
"58": "ng\u1ef1a r\u1eb1n",
|
84 |
+
"59": "\u0111\u0129a \u0103n",
|
85 |
+
"60": "v\u00f2i",
|
86 |
+
"61": "\u0111i\u1ec7n tho\u1ea1i",
|
87 |
+
"62": "con c\u1eebu",
|
88 |
+
"63": "t\u00f2a nh\u00e0",
|
89 |
+
"64": "v\u00e1n tr\u01b0\u1ee3t",
|
90 |
+
"65": "c\u1eeda s\u1ed5",
|
91 |
+
"66": "c\u1eeda h\u00e0ng",
|
92 |
+
"67": "t\u00f2a th\u00e1p",
|
93 |
+
"68": "b\u1ed3n t\u1eafm",
|
94 |
+
"69": "c\u00e1i r\u1ed5",
|
95 |
+
"70": "c\u00e2y",
|
96 |
+
"71": "m\u00e1y vi t\u00ednh",
|
97 |
+
"72": "qu\u00e1n \u0103n",
|
98 |
+
"73": "ga ra",
|
99 |
+
"74": "ch\u1ea3o",
|
100 |
+
"75": "v\u01b0\u1eddn b\u00e1ch th\u00fa",
|
101 |
+
"76": "nh\u00e0 \u1edf",
|
102 |
+
"77": "xe \u0111\u1ea9y",
|
103 |
+
"78": "laptop",
|
104 |
+
"79": "xe l\u1eeda",
|
105 |
+
"80": "b\u00f4ng hoa",
|
106 |
+
"81": "v\u00e1n l\u01b0\u1edbt s\u00f3ng",
|
107 |
+
"82": "c\u00e1i t\u00fai",
|
108 |
+
"83": "t\u1ee7 \u0111\u00e1",
|
109 |
+
"84": "qu\u1ea3 b\u00f3ng",
|
110 |
+
"85": "chu\u1ed1i",
|
111 |
+
"86": "s\u00e2n bay",
|
112 |
+
"87": "v\u0103n ph\u00f2ng",
|
113 |
+
"88": "th\u00f9ng ch\u1ee9a",
|
114 |
+
"89": "n\u00fai",
|
115 |
+
"90": "c\u00e1i b\u00e0n",
|
116 |
+
"91": "tr\u01b0\u1ee3t tuy\u1ebft",
|
117 |
+
"92": "c\u00e0 v\u1ea1t",
|
118 |
+
"93": "h\u1ed3 b\u01a1i",
|
119 |
+
"94": "b\u00e3i c\u1ecf",
|
120 |
+
"95": "b\u1ea3y",
|
121 |
+
"96": "m\u00f3n \u0103n",
|
122 |
+
"97": "\u0111\u01b0\u1eddng b\u1ed9",
|
123 |
+
"98": "xe",
|
124 |
+
"99": "n\u00f3n",
|
125 |
+
"100": "\u0111\u1ed9ng c\u01a1",
|
126 |
+
"101": "c\u00e1i m\u00e2m",
|
127 |
+
"102": "g\u1eady",
|
128 |
+
"103": "g\u1ea5u tr\u00fac",
|
129 |
+
"104": "c\u1eeda ti\u1ec7m",
|
130 |
+
"105": "con v\u1ecbt",
|
131 |
+
"106": "l\u1ed3ng",
|
132 |
+
"107": "t\u01b0\u1eddng",
|
133 |
+
"108": "c\u00e1i n\u1ed3i",
|
134 |
+
"109": "t\u1ee7 l\u1ea1nh",
|
135 |
+
"110": "c\u1eeda",
|
136 |
+
"111": "t\u00e1ch",
|
137 |
+
"112": "b\u1ee9c \u1ea3nh",
|
138 |
+
"113": "s\u00e2n v\u01b0\u1eddn",
|
139 |
+
"114": "\u0111\u1ed3i",
|
140 |
+
"115": "b\u1eefa \u0103n",
|
141 |
+
"116": "s\u00e2n v\u1eadn \u0111\u1ed9ng",
|
142 |
+
"117": "d\u0129a nh\u1ef1a",
|
143 |
+
"118": "ph\u01b0\u01a1ng ti\u1ec7n giao th\u00f4ng",
|
144 |
+
"119": "m\u00e1y xay",
|
145 |
+
"120": "\u0111\u1ed3 ch\u01a1i",
|
146 |
+
"121": "m\u0169",
|
147 |
+
"122": "rau",
|
148 |
+
"123": "\u00e1o vest",
|
149 |
+
"124": "v\u00f2i hoa sen",
|
150 |
+
"125": "b\u00e0n ch\u1ea3i",
|
151 |
+
"126": "c\u00e1i k\u1ec7",
|
152 |
+
"127": "\u0111\u01b0\u1eddng",
|
153 |
+
"128": "xe l\u0103n",
|
154 |
+
"129": "c\u00e0 r\u1ed1t",
|
155 |
+
"130": "xe c\u1ed9",
|
156 |
+
"131": "th\u00e2n c\u00e2y",
|
157 |
+
"132": "m\u00e1y \u1ea3nh",
|
158 |
+
"133": "chai",
|
159 |
+
"134": "\u00f4 c\u1eeda",
|
160 |
+
"135": "s\u00e2n",
|
161 |
+
"136": "b\u1ebfn du thuy\u1ec1n",
|
162 |
+
"137": "dao",
|
163 |
+
"138": "xe tay ga",
|
164 |
+
"139": "qu\u00e1n bar",
|
165 |
+
"140": "th\u01b0 vi\u1ec7n",
|
166 |
+
"141": "h\u00e0nh l\u00fd",
|
167 |
+
"142": "b\u1edd bi\u1ec3n",
|
168 |
+
"143": "t\u00e1m",
|
169 |
+
"144": "c\u00e1i l\u1ecd",
|
170 |
+
"145": "m\u1eb7t tr\u1eddi",
|
171 |
+
"146": "\u00e1o s\u01a1 mi",
|
172 |
+
"147": "qu\u1ea7y t\u00ednh ti\u1ec1n",
|
173 |
+
"148": "\u0111\u01b0\u1eddng s\u1eaft",
|
174 |
+
"149": "b\u1ea7u tr\u1eddi",
|
175 |
+
"150": "chu\u1ed9t",
|
176 |
+
"151": "r\u00e0o ch\u1eafn",
|
177 |
+
"152": "\u1ea3nh ch\u1ee5p",
|
178 |
+
"153": "balo",
|
179 |
+
"154": "b\u1ea3o t\u00e0ng",
|
180 |
+
"155": "qu\u1ea3 t\u00e1o",
|
181 |
+
"156": "hoa qu\u1ea3",
|
182 |
+
"157": "b\u1ee9c t\u01b0\u1ee3ng",
|
183 |
+
"158": "m\u00e1y t\u00ednh",
|
184 |
+
"159": "c\u00e1c t\u00f2a nh\u00e0",
|
185 |
+
"160": "ch\u00e9n \u0111\u0129a",
|
186 |
+
"161": "m\u01b0\u1eddi",
|
187 |
+
"162": "ch\u00edn",
|
188 |
+
"163": "gi\u1ea5y b\u1ea1c",
|
189 |
+
"164": "s\u00e0n nh\u00e0",
|
190 |
+
"165": "chu\u1ed3ng tr\u1ea1i",
|
191 |
+
"166": "l\u1edbp h\u1ecdc",
|
192 |
+
"167": "kho",
|
193 |
+
"168": "b\u1ebfp",
|
194 |
+
"169": "b\u1ea3ng",
|
195 |
+
"170": "gia s\u00fac",
|
196 |
+
"171": "th\u1ecbt",
|
197 |
+
"172": "b\u1ed3n ti\u1ec3u",
|
198 |
+
"173": "t\u1ea1p d\u1ec1",
|
199 |
+
"174": "c\u00e1i l\u1ec1u",
|
200 |
+
"175": "g\u0103ng tay",
|
201 |
+
"176": "h\u00e0nh lang",
|
202 |
+
"177": "l\u00e1",
|
203 |
+
"178": "t\u00fai",
|
204 |
+
"179": "h\u1ea3i \u00e2u",
|
205 |
+
"180": "v\u1ee3t",
|
206 |
+
"181": "b\u00e0n ph\u00edm",
|
207 |
+
"182": "s\u00f4 c\u00f4 la",
|
208 |
+
"183": "r\u01b0\u1ee3u",
|
209 |
+
"184": "t\u00e1o",
|
210 |
+
"185": "gian h\u00e0ng",
|
211 |
+
"186": "xe \u0111i\u1ec7n ng\u1ea7m",
|
212 |
+
"187": "m\u00e1y s\u1ea5y kh\u00f4",
|
213 |
+
"188": "toa xe",
|
214 |
+
"189": "trang thi\u1ebft b\u1ecb",
|
215 |
+
"190": "c\u1ed7 m\u00e1y",
|
216 |
+
"191": "n\u01b0\u1edbc",
|
217 |
+
"192": "c\u00e2y k\u00e9o",
|
218 |
+
"193": "ng\u0103n k\u00e9o",
|
219 |
+
"194": "v\u1ea1ch k\u1ebb \u0111\u01b0\u1eddng",
|
220 |
+
"195": "b\u00e1nh ng\u1ecdt",
|
221 |
+
"196": "l\u1ed1i \u0111i",
|
222 |
+
"197": "t\u00e0u",
|
223 |
+
"198": "\u0111\u01b0\u1eddng \u0111i b\u1ed9",
|
224 |
+
"199": "d\u0129a",
|
225 |
+
"200": "con v\u1eb9t",
|
226 |
+
"201": "l\u00e1 c\u1edd",
|
227 |
+
"202": "kh\u0103n",
|
228 |
+
"203": "chung c\u01b0",
|
229 |
+
"204": "h\u1ed3",
|
230 |
+
"205": "ca n\u00f4",
|
231 |
+
"206": "gi\u00e1 \u0111\u1ee1",
|
232 |
+
"207": "nh\u1eefng qu\u1ea3 cam",
|
233 |
+
"208": "b\u1eefa tr\u01b0a",
|
234 |
+
"209": "k\u00ednh \u0111eo",
|
235 |
+
"210": "cupcake",
|
236 |
+
"211": "\u0111\u01b0\u1eddng ray",
|
237 |
+
"212": "b\u1ed9 \u0111\u1ed3",
|
238 |
+
"213": "h\u00e0ng ho\u00e1",
|
239 |
+
"214": "nh\u1eefng b\u1ee9c \u1ea3nh",
|
240 |
+
"215": "c\u00e1i v\u00ed",
|
241 |
+
"216": "c\u1eebu",
|
242 |
+
"217": "ng\u01b0\u1eddi gi\u1eef",
|
243 |
+
"218": "b\u1ee9c tranh",
|
244 |
+
"219": "c\u1ea7u",
|
245 |
+
"220": "nhi\u1ec1u c\u00e1i gh\u1ebf",
|
246 |
+
"221": "b\u00f4ng c\u1ea3i xanh",
|
247 |
+
"222": "b\u1eefa \u0103n t\u1ed1i",
|
248 |
+
"223": "v\u1ebd tranh l\u00ean t\u01b0\u1eddng",
|
249 |
+
"224": "thuy\u1ec1n bu\u1ed3m",
|
250 |
+
"225": "\u0111i v\u0103ng",
|
251 |
+
"226": "s\u00e2n kh\u1ea5u",
|
252 |
+
"227": "n\u1ebfn",
|
253 |
+
"228": "bu\u1ed3ng",
|
254 |
+
"229": "c\u00e1i th\u00eca",
|
255 |
+
"230": "c\u1ecf kh\u00f4",
|
256 |
+
"231": "con kh\u1ec9",
|
257 |
+
"232": "t\u01b0\u1ee3ng \u0111\u00e0i",
|
258 |
+
"233": "t\u1ee7 \u0111\u00f4ng",
|
259 |
+
"234": "hoa h\u1ed3ng",
|
260 |
+
"235": "chim b\u1ed3 c\u00e2u",
|
261 |
+
"236": "hay",
|
262 |
+
"237": "g\u1ea7u m\u00fac",
|
263 |
+
"238": "b\u00fai t\u00f3c",
|
264 |
+
"239": "m\u00f3ng vu\u1ed1t",
|
265 |
+
"240": "xe \u0111i\u1ec7n",
|
266 |
+
"241": "\u0111\u0129a",
|
267 |
+
"242": "m\u00e0n",
|
268 |
+
"243": "\u00e1o kho\u00e1c",
|
269 |
+
"244": "m\u1eb7t n\u1ea1",
|
270 |
+
"245": "\u0111\u1ed3 u\u1ed1ng",
|
271 |
+
"246": "b\u00f2 \u0111\u1ef1c",
|
272 |
+
"247": "c\u00e1i n\u0129a",
|
273 |
+
"248": "\u0111\u01b0\u1eddng \u1ed1ng",
|
274 |
+
"249": "n\u01b0\u1edbc ti\u1ec3u",
|
275 |
+
"250": "ly",
|
276 |
+
"251": "\u0111\u00e8n \u0111\u1ec3 b\u00e0n",
|
277 |
+
"252": "\u0111\u1ed3 n\u1ed9i th\u1ea5t",
|
278 |
+
"253": "m\u00e1i ch\u00e8o",
|
279 |
+
"254": "\u0111\u1ea7u m\u00e1y",
|
280 |
+
"255": "\u0111\u1ea7m",
|
281 |
+
"256": "m\u0169 l\u01b0\u1ee1i trai",
|
282 |
+
"257": "truy\u1ec1n h\u00ecnh",
|
283 |
+
"258": "ph\u00f4 mai",
|
284 |
+
"259": "c\u00e0 ph\u00ea",
|
285 |
+
"260": "b\u1ebfn t\u00e0u",
|
286 |
+
"261": "con d\u00ea",
|
287 |
+
"262": "c\u1eeda ra v\u00e0o",
|
288 |
+
"263": "k\u00fd t\u00ean",
|
289 |
+
"264": "thi\u1ebft b\u1ecb",
|
290 |
+
"265": "b\u00ecnh hoa",
|
291 |
+
"266": "bia",
|
292 |
+
"267": "con d\u1ed1c",
|
293 |
+
"268": "\u00e1o cho\u00e0ng",
|
294 |
+
"269": "m\u00f3n tr\u00e1ng mi\u1ec7ng",
|
295 |
+
"270": "c\u00e2y s\u00e0o",
|
296 |
+
"271": "thu\u1ed1c l\u00e1",
|
297 |
+
"272": "m\u1eb7t",
|
298 |
+
"273": "k\u00ednh r\u00e2m",
|
299 |
+
"274": "\u0111i\u00eau kh\u1eafc",
|
300 |
+
"275": "nh\u00e0",
|
301 |
+
"276": "rau qu\u1ea3",
|
302 |
+
"277": "tr\u00e1i c\u00e2y",
|
303 |
+
"278": "qu\u1ea3 cam",
|
304 |
+
"279": "\u0111\u0129a n\u00e9m",
|
305 |
+
"280": "ba lan",
|
306 |
+
"281": "c\u00e2y g\u1eady",
|
307 |
+
"282": "s\u1eefa",
|
308 |
+
"283": "h\u1ed9p \u0111\u1ef1ng",
|
309 |
+
"284": "khung",
|
310 |
+
"285": "ngo\u00e0i tr\u1eddi",
|
311 |
+
"286": "\u0111o\u1ea1n phim gi\u1edbi thi\u1ec7u",
|
312 |
+
"287": "c\u1edd",
|
313 |
+
"288": "th\u00f9ng",
|
314 |
+
"289": "l\u00f2 s\u01b0\u1edfi",
|
315 |
+
"290": "l\u00e1t c\u1eaft",
|
316 |
+
"291": "b\u1eafp ch\u00e2n",
|
317 |
+
"292": "c\u00fan y\u00eau",
|
318 |
+
"293": "ng\u00e2n h\u00e0ng",
|
319 |
+
"294": "rau x\u00e0 l\u00e1ch",
|
320 |
+
"295": "xa l\u1ed9",
|
321 |
+
"296": "g\u00e0",
|
322 |
+
"297": "qu\u1ea7n short",
|
323 |
+
"298": "v\u00f2i n\u01b0\u1edbc",
|
324 |
+
"299": "m\u0169 b\u1ea3o hi\u1ec3m",
|
325 |
+
"300": "c\u00f4ng c\u1ee5",
|
326 |
+
"301": "qu\u1ea3 cam ",
|
327 |
+
"302": "v\u00e1n tr\u01b0\u1ee3t tuy\u1ebft",
|
328 |
+
"303": "g\u1ea1ch",
|
329 |
+
"304": "ch\u00ecm xu\u1ed1ng",
|
330 |
+
"305": "kh\u0103n t\u1eafm",
|
331 |
+
"306": "l\u00e1t g\u1ea1ch",
|
332 |
+
"307": "ng\u0103n",
|
333 |
+
"308": "b\u1ea3ng hi\u1ec7u",
|
334 |
+
"309": "l\u0103n tr\u00f2n",
|
335 |
+
"310": "hotdog",
|
336 |
+
"311": "c\u1ecf",
|
337 |
+
"312": "b\u00ecnh",
|
338 |
+
"313": "b\u00ean",
|
339 |
+
"314": "t\u00e0u ho\u1ea3",
|
340 |
+
"315": "b\u00e1nh xe",
|
341 |
+
"316": "lon",
|
342 |
+
"317": "nh\u00e0 t\u1eafm",
|
343 |
+
"318": "\u0111\u01b0\u1eddng \u0111ua",
|
344 |
+
"319": "m\u00e0u s\u1eafc",
|
345 |
+
"320": "bao b\u00ec",
|
346 |
+
"321": "th\u00e0nh ph\u1ea7n",
|
347 |
+
"322": "chim \u01b0ng",
|
348 |
+
"323": "\u0111i\u1ec3m t\u00e2m",
|
349 |
+
"324": "d\u0129a ",
|
350 |
+
"325": "b\u00e0n ch\u1ea3i \u0111\u00e1nh r\u0103ng",
|
351 |
+
"326": "h\u00e0ng h\u00f3a",
|
352 |
+
"327": "pug",
|
353 |
+
"328": "h\u1ed9p s\u1ed1",
|
354 |
+
"329": "c\u00e1",
|
355 |
+
"330": "gi\u1ecf",
|
356 |
+
"331": "gh\u1ebf s\u00f4 pha",
|
357 |
+
"332": "qu\u1ea7n \u00e1o",
|
358 |
+
"333": "tr\u01b0\u1eddng h\u1ee3p",
|
359 |
+
"334": "b\u00f2",
|
360 |
+
"335": "v\u00f4 tuy\u1ebfn",
|
361 |
+
"336": "con thoi",
|
362 |
+
"337": "theo d\u00f5i",
|
363 |
+
"338": "\u00e1o ba l\u1ed7",
|
364 |
+
"339": "d\u00f2ng s\u00f4ng",
|
365 |
+
"340": "g\u00e0 t\u00e2y",
|
366 |
+
"341": "d\u1ea5u hi\u1ec7u",
|
367 |
+
"342": "m\u00e8o con",
|
368 |
+
"343": "m\u1eaft",
|
369 |
+
"344": "\u0111\u01b0a \u0111\u00f3n",
|
370 |
+
"345": "con heo",
|
371 |
+
"346": "ngo\u00e0i",
|
372 |
+
"347": "\u0111\u1ed3ng ph\u1ee5c",
|
373 |
+
"348": "m\u00e1y bay tr\u1ef1c th\u0103ng",
|
374 |
+
"349": "\u0111\u1ea1i d\u01b0\u01a1ng",
|
375 |
+
"350": "b\u1ee9c m\u00e0n",
|
376 |
+
"351": "cam",
|
377 |
+
"352": "b\u00e1nh hamburger"
|
378 |
+
},
|
379 |
"img_size": 224,
|
380 |
"in_chans": 3,
|
381 |
+
"label2id": null,
|
382 |
"layernorm_embedding": false,
|
383 |
"layernorm_eps": 1e-05,
|
384 |
"max_rel_pos": 0,
|
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4911305908
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:350969d1b1809558a103e887928ed65b68950e1d33edaff47e549c923b5b7691
|
3 |
size 4911305908
|
modeling_vivqa.py
CHANGED
@@ -38,10 +38,12 @@ class Blip2EfficientExtractor(nn.Module):
|
|
38 |
|
39 |
# Efficientnet
|
40 |
self.model_efficient = EfficientNet.from_pretrained('efficientnet-b7').to(self.device)
|
|
|
41 |
self.pooling1 = nn.AdaptiveAvgPool2d((1, 32))
|
42 |
self.pooling2 = nn.AdaptiveAvgPool2d((1, 768))
|
43 |
|
44 |
-
def forward(self, images):
|
|
|
45 |
global_features = self.model_blip2.extract_features(samples={"image": images}, mode="image").image_embeds
|
46 |
|
47 |
local_features = self.model_efficient.extract_features(images)
|
@@ -111,18 +113,19 @@ class ViVQABEiT3(PreTrainedModel):
|
|
111 |
x1 = self.vision_embed(visual_tokens)
|
112 |
multiway_split_position = x1.size(1)
|
113 |
|
114 |
-
x2 = self.text_embed(textual_tokens, text_padding_position)
|
115 |
x2 = self.linear(x2)
|
116 |
|
117 |
x = torch.cat([x1, x2], dim=1)
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
|
|
126 |
encoder_out = self.encoder(
|
127 |
src_tokens=None,
|
128 |
encoder_padding_mask=encoder_padding_mask,
|
|
|
38 |
|
39 |
# Efficientnet
|
40 |
self.model_efficient = EfficientNet.from_pretrained('efficientnet-b7').to(self.device)
|
41 |
+
self.model_efficient.eval()
|
42 |
self.pooling1 = nn.AdaptiveAvgPool2d((1, 32))
|
43 |
self.pooling2 = nn.AdaptiveAvgPool2d((1, 768))
|
44 |
|
45 |
+
def forward(self, images):
|
46 |
+
|
47 |
global_features = self.model_blip2.extract_features(samples={"image": images}, mode="image").image_embeds
|
48 |
|
49 |
local_features = self.model_efficient.extract_features(images)
|
|
|
113 |
x1 = self.vision_embed(visual_tokens)
|
114 |
multiway_split_position = x1.size(1)
|
115 |
|
116 |
+
x2 = self.text_embed(textual_tokens, 1-text_padding_position)
|
117 |
x2 = self.linear(x2)
|
118 |
|
119 |
x = torch.cat([x1, x2], dim=1)
|
120 |
+
|
121 |
+
encoder_padding_mask = torch.cat(
|
122 |
+
[
|
123 |
+
torch.zeros(x1.shape[:-1]).to(x1.device).bool(),
|
124 |
+
text_padding_position,
|
125 |
+
],
|
126 |
+
dim=1,
|
127 |
+
)
|
128 |
+
|
129 |
encoder_out = self.encoder(
|
130 |
src_tokens=None,
|
131 |
encoder_padding_mask=encoder_padding_mask,
|