KoichiYasuoka committed on
Commit
df327a7
·
1 Parent(s): afec0a4

model improved

Browse files
Files changed (4) hide show
  1. config.json +436 -380
  2. maker.py +45 -39
  3. pytorch_model.bin +2 -2
  4. ud.py +57 -32
config.json CHANGED
@@ -34,392 +34,448 @@
34
  "hidden_size": 1024,
35
  "id2label": {
36
  "0": "ADJ",
37
- "1": "ADJ|acl-s",
38
- "2": "ADJ|acl-t",
39
- "3": "ADJ|advcl-s",
40
- "4": "ADJ|advcl-t",
41
- "5": "ADJ|amod-s",
42
- "6": "ADJ|amod-t",
43
- "7": "ADJ|ccomp-s",
44
- "8": "ADJ|ccomp-t",
45
- "9": "ADJ|csubj-s",
46
- "10": "ADJ|csubj-t",
47
- "11": "ADJ|csubj:outer-s",
48
- "12": "ADJ|csubj:outer-t",
49
- "13": "ADJ|dep-s",
50
- "14": "ADJ|dep-t",
51
- "15": "ADJ|nmod-s",
52
- "16": "ADJ|nmod-t",
53
- "17": "ADJ|nsubj-s",
54
- "18": "ADJ|nsubj-t",
55
- "19": "ADJ|obj-s",
56
- "20": "ADJ|obj-t",
57
- "21": "ADJ|obl-s",
58
- "22": "ADJ|obl-t",
59
- "23": "ADJ|root",
60
- "24": "ADP",
61
- "25": "ADP|case-s",
62
- "26": "ADP|case-t",
63
- "27": "ADP|fixed-s",
64
- "28": "ADP|fixed-t",
65
- "29": "ADV",
66
- "30": "ADV|advcl-s",
67
- "31": "ADV|advcl-t",
68
- "32": "ADV|advmod-s",
69
- "33": "ADV|advmod-t",
70
- "34": "ADV|dep-s",
71
- "35": "ADV|dep-t",
72
- "36": "ADV|obj-s",
73
- "37": "ADV|obj-t",
74
- "38": "ADV|root",
75
- "39": "AUX",
76
- "40": "AUX|Polarity=Neg",
77
- "41": "AUX|Polarity=Neg|aux-s",
78
- "42": "AUX|Polarity=Neg|aux-t",
79
- "43": "AUX|Polarity=Neg|fixed-s",
80
- "44": "AUX|Polarity=Neg|fixed-t",
81
- "45": "AUX|aux-s",
82
- "46": "AUX|aux-t",
83
- "47": "AUX|cop-s",
84
- "48": "AUX|cop-t",
85
- "49": "AUX|fixed-s",
86
- "50": "AUX|fixed-t",
87
- "51": "AUX|root",
88
- "52": "CCONJ",
89
- "53": "CCONJ|cc-s",
90
- "54": "CCONJ|cc-t",
91
- "55": "DET",
92
- "56": "DET|det-s",
93
- "57": "DET|det-t",
94
- "58": "INTJ",
95
- "59": "INTJ|discourse-s",
96
- "60": "INTJ|discourse-t",
97
- "61": "INTJ|root",
98
- "62": "NOUN",
99
- "63": "NOUN|Polarity=Neg",
100
- "64": "NOUN|Polarity=Neg|obl-s",
101
- "65": "NOUN|Polarity=Neg|obl-t",
102
- "66": "NOUN|Polarity=Neg|root",
103
- "67": "NOUN|acl-s",
104
- "68": "NOUN|acl-t",
105
- "69": "NOUN|advcl-s",
106
- "70": "NOUN|advcl-t",
107
- "71": "NOUN|ccomp-s",
108
- "72": "NOUN|ccomp-t",
109
- "73": "NOUN|compound-s",
110
- "74": "NOUN|compound-t",
111
- "75": "NOUN|csubj-s",
112
- "76": "NOUN|csubj-t",
113
- "77": "NOUN|csubj:outer-s",
114
- "78": "NOUN|csubj:outer-t",
115
- "79": "NOUN|nmod-s",
116
- "80": "NOUN|nmod-t",
117
- "81": "NOUN|nsubj-s",
118
- "82": "NOUN|nsubj-t",
119
- "83": "NOUN|nsubj:outer-s",
120
- "84": "NOUN|nsubj:outer-t",
121
- "85": "NOUN|obj-s",
122
- "86": "NOUN|obj-t",
123
- "87": "NOUN|obl-s",
124
- "88": "NOUN|obl-t",
125
- "89": "NOUN|root",
126
- "90": "NUM",
127
- "91": "NUM|advcl-s",
128
- "92": "NUM|advcl-t",
129
- "93": "NUM|compound-s",
130
- "94": "NUM|compound-t",
131
- "95": "NUM|nmod-s",
132
- "96": "NUM|nmod-t",
133
- "97": "NUM|nsubj-s",
134
- "98": "NUM|nsubj-t",
135
- "99": "NUM|nsubj:outer-s",
136
- "100": "NUM|nsubj:outer-t",
137
- "101": "NUM|nummod-s",
138
- "102": "NUM|nummod-t",
139
- "103": "NUM|obj-s",
140
- "104": "NUM|obj-t",
141
- "105": "NUM|obl-s",
142
- "106": "NUM|obl-t",
143
- "107": "NUM|root",
144
- "108": "PART",
145
- "109": "PART|mark-s",
146
- "110": "PART|mark-t",
147
- "111": "PRON",
148
- "112": "PRON|acl-s",
149
- "113": "PRON|acl-t",
150
- "114": "PRON|advcl-s",
151
- "115": "PRON|advcl-t",
152
- "116": "PRON|nmod-s",
153
- "117": "PRON|nmod-t",
154
- "118": "PRON|nsubj-s",
155
- "119": "PRON|nsubj-t",
156
- "120": "PRON|nsubj:outer-s",
157
- "121": "PRON|nsubj:outer-t",
158
- "122": "PRON|obj-s",
159
- "123": "PRON|obj-t",
160
- "124": "PRON|obl-s",
161
- "125": "PRON|obl-t",
162
- "126": "PRON|root",
163
- "127": "PROPN",
164
- "128": "PROPN|acl-s",
165
- "129": "PROPN|acl-t",
166
- "130": "PROPN|advcl-s",
167
- "131": "PROPN|advcl-t",
168
- "132": "PROPN|compound-s",
169
- "133": "PROPN|compound-t",
170
- "134": "PROPN|nmod-s",
171
- "135": "PROPN|nmod-t",
172
- "136": "PROPN|nsubj-s",
173
- "137": "PROPN|nsubj-t",
174
- "138": "PROPN|nsubj:outer-s",
175
- "139": "PROPN|nsubj:outer-t",
176
- "140": "PROPN|obj-s",
177
- "141": "PROPN|obj-t",
178
- "142": "PROPN|obl-s",
179
- "143": "PROPN|obl-t",
180
- "144": "PROPN|root",
181
- "145": "PUNCT",
182
- "146": "PUNCT|punct-s",
183
- "147": "PUNCT|punct-t",
184
- "148": "SCONJ",
185
- "149": "SCONJ|dep-s",
186
- "150": "SCONJ|dep-t",
187
- "151": "SCONJ|fixed-s",
188
- "152": "SCONJ|fixed-t",
189
- "153": "SCONJ|mark-s",
190
- "154": "SCONJ|mark-t",
191
- "155": "SYM",
192
- "156": "SYM|compound-s",
193
- "157": "SYM|compound-t",
194
- "158": "SYM|dep-s",
195
- "159": "SYM|dep-t",
196
- "160": "SYM|nmod-s",
197
- "161": "SYM|nmod-t",
198
- "162": "SYM|obl-s",
199
- "163": "SYM|obl-t",
200
- "164": "VERB",
201
- "165": "VERB|acl-s",
202
- "166": "VERB|acl-t",
203
- "167": "VERB|advcl-s",
204
- "168": "VERB|advcl-t",
205
- "169": "VERB|ccomp-s",
206
- "170": "VERB|ccomp-t",
207
- "171": "VERB|compound-s",
208
- "172": "VERB|compound-t",
209
- "173": "VERB|csubj-s",
210
- "174": "VERB|csubj-t",
211
- "175": "VERB|csubj:outer-s",
212
- "176": "VERB|csubj:outer-t",
213
- "177": "VERB|nmod-s",
214
- "178": "VERB|nmod-t",
215
- "179": "VERB|obj-s",
216
- "180": "VERB|obj-t",
217
- "181": "VERB|obl-s",
218
- "182": "VERB|obl-t",
219
- "183": "VERB|root",
220
- "184": "X",
221
- "185": "X|dep-s",
222
- "186": "X|dep-t",
223
- "187": "X|goeswith-s",
224
- "188": "X|goeswith-t",
225
- "189": "X|nmod-s",
226
- "190": "X|nmod-t"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
  },
228
  "initializer_cutoff_factor": 2.0,
229
  "initializer_range": 0.02,
230
  "intermediate_size": 2624,
231
  "label2id": {
232
  "ADJ": 0,
233
- "ADJ|acl-s": 1,
234
- "ADJ|acl-t": 2,
235
- "ADJ|advcl-s": 3,
236
- "ADJ|advcl-t": 4,
237
- "ADJ|amod-s": 5,
238
- "ADJ|amod-t": 6,
239
- "ADJ|ccomp-s": 7,
240
- "ADJ|ccomp-t": 8,
241
- "ADJ|csubj-s": 9,
242
- "ADJ|csubj-t": 10,
243
- "ADJ|csubj:outer-s": 11,
244
- "ADJ|csubj:outer-t": 12,
245
- "ADJ|dep-s": 13,
246
- "ADJ|dep-t": 14,
247
- "ADJ|nmod-s": 15,
248
- "ADJ|nmod-t": 16,
249
- "ADJ|nsubj-s": 17,
250
- "ADJ|nsubj-t": 18,
251
- "ADJ|obj-s": 19,
252
- "ADJ|obj-t": 20,
253
- "ADJ|obl-s": 21,
254
- "ADJ|obl-t": 22,
255
- "ADJ|root": 23,
256
- "ADP": 24,
257
- "ADP|case-s": 25,
258
- "ADP|case-t": 26,
259
- "ADP|fixed-s": 27,
260
- "ADP|fixed-t": 28,
261
- "ADV": 29,
262
- "ADV|advcl-s": 30,
263
- "ADV|advcl-t": 31,
264
- "ADV|advmod-s": 32,
265
- "ADV|advmod-t": 33,
266
- "ADV|dep-s": 34,
267
- "ADV|dep-t": 35,
268
- "ADV|obj-s": 36,
269
- "ADV|obj-t": 37,
270
- "ADV|root": 38,
271
- "AUX": 39,
272
- "AUX|Polarity=Neg": 40,
273
- "AUX|Polarity=Neg|aux-s": 41,
274
- "AUX|Polarity=Neg|aux-t": 42,
275
- "AUX|Polarity=Neg|fixed-s": 43,
276
- "AUX|Polarity=Neg|fixed-t": 44,
277
- "AUX|aux-s": 45,
278
- "AUX|aux-t": 46,
279
- "AUX|cop-s": 47,
280
- "AUX|cop-t": 48,
281
- "AUX|fixed-s": 49,
282
- "AUX|fixed-t": 50,
283
- "AUX|root": 51,
284
- "CCONJ": 52,
285
- "CCONJ|cc-s": 53,
286
- "CCONJ|cc-t": 54,
287
- "DET": 55,
288
- "DET|det-s": 56,
289
- "DET|det-t": 57,
290
- "INTJ": 58,
291
- "INTJ|discourse-s": 59,
292
- "INTJ|discourse-t": 60,
293
- "INTJ|root": 61,
294
- "NOUN": 62,
295
- "NOUN|Polarity=Neg": 63,
296
- "NOUN|Polarity=Neg|obl-s": 64,
297
- "NOUN|Polarity=Neg|obl-t": 65,
298
- "NOUN|Polarity=Neg|root": 66,
299
- "NOUN|acl-s": 67,
300
- "NOUN|acl-t": 68,
301
- "NOUN|advcl-s": 69,
302
- "NOUN|advcl-t": 70,
303
- "NOUN|ccomp-s": 71,
304
- "NOUN|ccomp-t": 72,
305
- "NOUN|compound-s": 73,
306
- "NOUN|compound-t": 74,
307
- "NOUN|csubj-s": 75,
308
- "NOUN|csubj-t": 76,
309
- "NOUN|csubj:outer-s": 77,
310
- "NOUN|csubj:outer-t": 78,
311
- "NOUN|nmod-s": 79,
312
- "NOUN|nmod-t": 80,
313
- "NOUN|nsubj-s": 81,
314
- "NOUN|nsubj-t": 82,
315
- "NOUN|nsubj:outer-s": 83,
316
- "NOUN|nsubj:outer-t": 84,
317
- "NOUN|obj-s": 85,
318
- "NOUN|obj-t": 86,
319
- "NOUN|obl-s": 87,
320
- "NOUN|obl-t": 88,
321
- "NOUN|root": 89,
322
- "NUM": 90,
323
- "NUM|advcl-s": 91,
324
- "NUM|advcl-t": 92,
325
- "NUM|compound-s": 93,
326
- "NUM|compound-t": 94,
327
- "NUM|nmod-s": 95,
328
- "NUM|nmod-t": 96,
329
- "NUM|nsubj-s": 97,
330
- "NUM|nsubj-t": 98,
331
- "NUM|nsubj:outer-s": 99,
332
- "NUM|nsubj:outer-t": 100,
333
- "NUM|nummod-s": 101,
334
- "NUM|nummod-t": 102,
335
- "NUM|obj-s": 103,
336
- "NUM|obj-t": 104,
337
- "NUM|obl-s": 105,
338
- "NUM|obl-t": 106,
339
- "NUM|root": 107,
340
- "PART": 108,
341
- "PART|mark-s": 109,
342
- "PART|mark-t": 110,
343
- "PRON": 111,
344
- "PRON|acl-s": 112,
345
- "PRON|acl-t": 113,
346
- "PRON|advcl-s": 114,
347
- "PRON|advcl-t": 115,
348
- "PRON|nmod-s": 116,
349
- "PRON|nmod-t": 117,
350
- "PRON|nsubj-s": 118,
351
- "PRON|nsubj-t": 119,
352
- "PRON|nsubj:outer-s": 120,
353
- "PRON|nsubj:outer-t": 121,
354
- "PRON|obj-s": 122,
355
- "PRON|obj-t": 123,
356
- "PRON|obl-s": 124,
357
- "PRON|obl-t": 125,
358
- "PRON|root": 126,
359
- "PROPN": 127,
360
- "PROPN|acl-s": 128,
361
- "PROPN|acl-t": 129,
362
- "PROPN|advcl-s": 130,
363
- "PROPN|advcl-t": 131,
364
- "PROPN|compound-s": 132,
365
- "PROPN|compound-t": 133,
366
- "PROPN|nmod-s": 134,
367
- "PROPN|nmod-t": 135,
368
- "PROPN|nsubj-s": 136,
369
- "PROPN|nsubj-t": 137,
370
- "PROPN|nsubj:outer-s": 138,
371
- "PROPN|nsubj:outer-t": 139,
372
- "PROPN|obj-s": 140,
373
- "PROPN|obj-t": 141,
374
- "PROPN|obl-s": 142,
375
- "PROPN|obl-t": 143,
376
- "PROPN|root": 144,
377
- "PUNCT": 145,
378
- "PUNCT|punct-s": 146,
379
- "PUNCT|punct-t": 147,
380
- "SCONJ": 148,
381
- "SCONJ|dep-s": 149,
382
- "SCONJ|dep-t": 150,
383
- "SCONJ|fixed-s": 151,
384
- "SCONJ|fixed-t": 152,
385
- "SCONJ|mark-s": 153,
386
- "SCONJ|mark-t": 154,
387
- "SYM": 155,
388
- "SYM|compound-s": 156,
389
- "SYM|compound-t": 157,
390
- "SYM|dep-s": 158,
391
- "SYM|dep-t": 159,
392
- "SYM|nmod-s": 160,
393
- "SYM|nmod-t": 161,
394
- "SYM|obl-s": 162,
395
- "SYM|obl-t": 163,
396
- "VERB": 164,
397
- "VERB|acl-s": 165,
398
- "VERB|acl-t": 166,
399
- "VERB|advcl-s": 167,
400
- "VERB|advcl-t": 168,
401
- "VERB|ccomp-s": 169,
402
- "VERB|ccomp-t": 170,
403
- "VERB|compound-s": 171,
404
- "VERB|compound-t": 172,
405
- "VERB|csubj-s": 173,
406
- "VERB|csubj-t": 174,
407
- "VERB|csubj:outer-s": 175,
408
- "VERB|csubj:outer-t": 176,
409
- "VERB|nmod-s": 177,
410
- "VERB|nmod-t": 178,
411
- "VERB|obj-s": 179,
412
- "VERB|obj-t": 180,
413
- "VERB|obl-s": 181,
414
- "VERB|obl-t": 182,
415
- "VERB|root": 183,
416
- "X": 184,
417
- "X|dep-s": 185,
418
- "X|dep-t": 186,
419
- "X|goeswith-s": 187,
420
- "X|goeswith-t": 188,
421
- "X|nmod-s": 189,
422
- "X|nmod-t": 190
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
423
  },
424
  "layer_norm_eps": 1e-05,
425
  "local_attention": 128,
 
34
  "hidden_size": 1024,
35
  "id2label": {
36
  "0": "ADJ",
37
+ "1": "ADJ.",
38
+ "2": "ADJ.|[acl]",
39
+ "3": "ADJ.|[advcl]",
40
+ "4": "ADJ.|[amod]",
41
+ "5": "ADJ.|[ccomp]",
42
+ "6": "ADJ.|[csubj:outer]",
43
+ "7": "ADJ.|[csubj]",
44
+ "8": "ADJ.|[dep]",
45
+ "9": "ADJ.|[nmod]",
46
+ "10": "ADJ.|[nsubj]",
47
+ "11": "ADJ.|[obj]",
48
+ "12": "ADJ.|[obl]",
49
+ "13": "ADJ.|[root]",
50
+ "14": "ADJ|[acl]",
51
+ "15": "ADJ|[advcl]",
52
+ "16": "ADJ|[amod]",
53
+ "17": "ADJ|[ccomp]",
54
+ "18": "ADJ|[csubj:outer]",
55
+ "19": "ADJ|[csubj]",
56
+ "20": "ADJ|[dep]",
57
+ "21": "ADJ|[nmod]",
58
+ "22": "ADJ|[nsubj]",
59
+ "23": "ADJ|[obj]",
60
+ "24": "ADJ|[obl]",
61
+ "25": "ADJ|[root]",
62
+ "26": "ADP",
63
+ "27": "ADP.",
64
+ "28": "ADP.|[case]",
65
+ "29": "ADP.|[fixed]",
66
+ "30": "ADP|[case]",
67
+ "31": "ADP|[fixed]",
68
+ "32": "ADV",
69
+ "33": "ADV.",
70
+ "34": "ADV.|[advcl]",
71
+ "35": "ADV.|[advmod]",
72
+ "36": "ADV.|[dep]",
73
+ "37": "ADV.|[obj]",
74
+ "38": "ADV.|[root]",
75
+ "39": "ADV|[advcl]",
76
+ "40": "ADV|[advmod]",
77
+ "41": "ADV|[dep]",
78
+ "42": "ADV|[obj]",
79
+ "43": "ADV|[root]",
80
+ "44": "AUX",
81
+ "45": "AUX.",
82
+ "46": "AUX.|Polarity=Neg",
83
+ "47": "AUX.|Polarity=Neg|[aux]",
84
+ "48": "AUX.|Polarity=Neg|[fixed]",
85
+ "49": "AUX.|[aux]",
86
+ "50": "AUX.|[cop]",
87
+ "51": "AUX.|[fixed]",
88
+ "52": "AUX.|[root]",
89
+ "53": "AUX|Polarity=Neg",
90
+ "54": "AUX|Polarity=Neg|[aux]",
91
+ "55": "AUX|Polarity=Neg|[fixed]",
92
+ "56": "AUX|[aux]",
93
+ "57": "AUX|[cop]",
94
+ "58": "AUX|[fixed]",
95
+ "59": "AUX|[root]",
96
+ "60": "CCONJ",
97
+ "61": "CCONJ.",
98
+ "62": "CCONJ.|[cc]",
99
+ "63": "CCONJ|[cc]",
100
+ "64": "DET",
101
+ "65": "DET.",
102
+ "66": "DET.|[det]",
103
+ "67": "DET|[det]",
104
+ "68": "INTJ",
105
+ "69": "INTJ.",
106
+ "70": "INTJ.|[discourse]",
107
+ "71": "INTJ.|[root]",
108
+ "72": "INTJ|[discourse]",
109
+ "73": "INTJ|[root]",
110
+ "74": "NOUN",
111
+ "75": "NOUN.",
112
+ "76": "NOUN.|Polarity=Neg",
113
+ "77": "NOUN.|Polarity=Neg|[obl]",
114
+ "78": "NOUN.|Polarity=Neg|[root]",
115
+ "79": "NOUN.|[acl]",
116
+ "80": "NOUN.|[advcl]",
117
+ "81": "NOUN.|[ccomp]",
118
+ "82": "NOUN.|[compound]",
119
+ "83": "NOUN.|[csubj:outer]",
120
+ "84": "NOUN.|[csubj]",
121
+ "85": "NOUN.|[nmod]",
122
+ "86": "NOUN.|[nsubj:outer]",
123
+ "87": "NOUN.|[nsubj]",
124
+ "88": "NOUN.|[obj]",
125
+ "89": "NOUN.|[obl]",
126
+ "90": "NOUN.|[root]",
127
+ "91": "NOUN|Polarity=Neg",
128
+ "92": "NOUN|Polarity=Neg|[obl]",
129
+ "93": "NOUN|Polarity=Neg|[root]",
130
+ "94": "NOUN|[acl]",
131
+ "95": "NOUN|[advcl]",
132
+ "96": "NOUN|[ccomp]",
133
+ "97": "NOUN|[compound]",
134
+ "98": "NOUN|[csubj:outer]",
135
+ "99": "NOUN|[csubj]",
136
+ "100": "NOUN|[nmod]",
137
+ "101": "NOUN|[nsubj:outer]",
138
+ "102": "NOUN|[nsubj]",
139
+ "103": "NOUN|[obj]",
140
+ "104": "NOUN|[obl]",
141
+ "105": "NOUN|[root]",
142
+ "106": "NUM",
143
+ "107": "NUM.",
144
+ "108": "NUM.|[advcl]",
145
+ "109": "NUM.|[compound]",
146
+ "110": "NUM.|[nmod]",
147
+ "111": "NUM.|[nsubj:outer]",
148
+ "112": "NUM.|[nsubj]",
149
+ "113": "NUM.|[nummod]",
150
+ "114": "NUM.|[obj]",
151
+ "115": "NUM.|[obl]",
152
+ "116": "NUM.|[root]",
153
+ "117": "NUM|[advcl]",
154
+ "118": "NUM|[compound]",
155
+ "119": "NUM|[nmod]",
156
+ "120": "NUM|[nsubj:outer]",
157
+ "121": "NUM|[nsubj]",
158
+ "122": "NUM|[nummod]",
159
+ "123": "NUM|[obj]",
160
+ "124": "NUM|[obl]",
161
+ "125": "NUM|[root]",
162
+ "126": "PART",
163
+ "127": "PART.",
164
+ "128": "PART.|[mark]",
165
+ "129": "PART|[mark]",
166
+ "130": "PRON",
167
+ "131": "PRON.",
168
+ "132": "PRON.|[acl]",
169
+ "133": "PRON.|[advcl]",
170
+ "134": "PRON.|[nmod]",
171
+ "135": "PRON.|[nsubj:outer]",
172
+ "136": "PRON.|[nsubj]",
173
+ "137": "PRON.|[obj]",
174
+ "138": "PRON.|[obl]",
175
+ "139": "PRON.|[root]",
176
+ "140": "PRON|[acl]",
177
+ "141": "PRON|[advcl]",
178
+ "142": "PRON|[nmod]",
179
+ "143": "PRON|[nsubj:outer]",
180
+ "144": "PRON|[nsubj]",
181
+ "145": "PRON|[obj]",
182
+ "146": "PRON|[obl]",
183
+ "147": "PRON|[root]",
184
+ "148": "PROPN",
185
+ "149": "PROPN.",
186
+ "150": "PROPN.|[acl]",
187
+ "151": "PROPN.|[advcl]",
188
+ "152": "PROPN.|[compound]",
189
+ "153": "PROPN.|[nmod]",
190
+ "154": "PROPN.|[nsubj:outer]",
191
+ "155": "PROPN.|[nsubj]",
192
+ "156": "PROPN.|[obj]",
193
+ "157": "PROPN.|[obl]",
194
+ "158": "PROPN.|[root]",
195
+ "159": "PROPN|[acl]",
196
+ "160": "PROPN|[advcl]",
197
+ "161": "PROPN|[compound]",
198
+ "162": "PROPN|[nmod]",
199
+ "163": "PROPN|[nsubj:outer]",
200
+ "164": "PROPN|[nsubj]",
201
+ "165": "PROPN|[obj]",
202
+ "166": "PROPN|[obl]",
203
+ "167": "PROPN|[root]",
204
+ "168": "PUNCT",
205
+ "169": "PUNCT.",
206
+ "170": "PUNCT.|[punct]",
207
+ "171": "PUNCT|[punct]",
208
+ "172": "SCONJ",
209
+ "173": "SCONJ.",
210
+ "174": "SCONJ.|[dep]",
211
+ "175": "SCONJ.|[fixed]",
212
+ "176": "SCONJ.|[mark]",
213
+ "177": "SCONJ|[dep]",
214
+ "178": "SCONJ|[fixed]",
215
+ "179": "SCONJ|[mark]",
216
+ "180": "SYM",
217
+ "181": "SYM.",
218
+ "182": "SYM.|[compound]",
219
+ "183": "SYM.|[dep]",
220
+ "184": "SYM.|[nmod]",
221
+ "185": "SYM.|[obl]",
222
+ "186": "SYM|[compound]",
223
+ "187": "SYM|[dep]",
224
+ "188": "SYM|[nmod]",
225
+ "189": "SYM|[obl]",
226
+ "190": "VERB",
227
+ "191": "VERB.",
228
+ "192": "VERB.|[acl]",
229
+ "193": "VERB.|[advcl]",
230
+ "194": "VERB.|[ccomp]",
231
+ "195": "VERB.|[compound]",
232
+ "196": "VERB.|[csubj:outer]",
233
+ "197": "VERB.|[csubj]",
234
+ "198": "VERB.|[nmod]",
235
+ "199": "VERB.|[obj]",
236
+ "200": "VERB.|[obl]",
237
+ "201": "VERB.|[root]",
238
+ "202": "VERB|[acl]",
239
+ "203": "VERB|[advcl]",
240
+ "204": "VERB|[ccomp]",
241
+ "205": "VERB|[compound]",
242
+ "206": "VERB|[csubj:outer]",
243
+ "207": "VERB|[csubj]",
244
+ "208": "VERB|[nmod]",
245
+ "209": "VERB|[obj]",
246
+ "210": "VERB|[obl]",
247
+ "211": "VERB|[root]",
248
+ "212": "X",
249
+ "213": "X.",
250
+ "214": "X.|[dep]",
251
+ "215": "X.|[goeswith]",
252
+ "216": "X.|[nmod]",
253
+ "217": "X|[dep]",
254
+ "218": "X|[nmod]"
255
  },
256
  "initializer_cutoff_factor": 2.0,
257
  "initializer_range": 0.02,
258
  "intermediate_size": 2624,
259
  "label2id": {
260
  "ADJ": 0,
261
+ "ADJ.": 1,
262
+ "ADJ.|[acl]": 2,
263
+ "ADJ.|[advcl]": 3,
264
+ "ADJ.|[amod]": 4,
265
+ "ADJ.|[ccomp]": 5,
266
+ "ADJ.|[csubj:outer]": 6,
267
+ "ADJ.|[csubj]": 7,
268
+ "ADJ.|[dep]": 8,
269
+ "ADJ.|[nmod]": 9,
270
+ "ADJ.|[nsubj]": 10,
271
+ "ADJ.|[obj]": 11,
272
+ "ADJ.|[obl]": 12,
273
+ "ADJ.|[root]": 13,
274
+ "ADJ|[acl]": 14,
275
+ "ADJ|[advcl]": 15,
276
+ "ADJ|[amod]": 16,
277
+ "ADJ|[ccomp]": 17,
278
+ "ADJ|[csubj:outer]": 18,
279
+ "ADJ|[csubj]": 19,
280
+ "ADJ|[dep]": 20,
281
+ "ADJ|[nmod]": 21,
282
+ "ADJ|[nsubj]": 22,
283
+ "ADJ|[obj]": 23,
284
+ "ADJ|[obl]": 24,
285
+ "ADJ|[root]": 25,
286
+ "ADP": 26,
287
+ "ADP.": 27,
288
+ "ADP.|[case]": 28,
289
+ "ADP.|[fixed]": 29,
290
+ "ADP|[case]": 30,
291
+ "ADP|[fixed]": 31,
292
+ "ADV": 32,
293
+ "ADV.": 33,
294
+ "ADV.|[advcl]": 34,
295
+ "ADV.|[advmod]": 35,
296
+ "ADV.|[dep]": 36,
297
+ "ADV.|[obj]": 37,
298
+ "ADV.|[root]": 38,
299
+ "ADV|[advcl]": 39,
300
+ "ADV|[advmod]": 40,
301
+ "ADV|[dep]": 41,
302
+ "ADV|[obj]": 42,
303
+ "ADV|[root]": 43,
304
+ "AUX": 44,
305
+ "AUX.": 45,
306
+ "AUX.|Polarity=Neg": 46,
307
+ "AUX.|Polarity=Neg|[aux]": 47,
308
+ "AUX.|Polarity=Neg|[fixed]": 48,
309
+ "AUX.|[aux]": 49,
310
+ "AUX.|[cop]": 50,
311
+ "AUX.|[fixed]": 51,
312
+ "AUX.|[root]": 52,
313
+ "AUX|Polarity=Neg": 53,
314
+ "AUX|Polarity=Neg|[aux]": 54,
315
+ "AUX|Polarity=Neg|[fixed]": 55,
316
+ "AUX|[aux]": 56,
317
+ "AUX|[cop]": 57,
318
+ "AUX|[fixed]": 58,
319
+ "AUX|[root]": 59,
320
+ "CCONJ": 60,
321
+ "CCONJ.": 61,
322
+ "CCONJ.|[cc]": 62,
323
+ "CCONJ|[cc]": 63,
324
+ "DET": 64,
325
+ "DET.": 65,
326
+ "DET.|[det]": 66,
327
+ "DET|[det]": 67,
328
+ "INTJ": 68,
329
+ "INTJ.": 69,
330
+ "INTJ.|[discourse]": 70,
331
+ "INTJ.|[root]": 71,
332
+ "INTJ|[discourse]": 72,
333
+ "INTJ|[root]": 73,
334
+ "NOUN": 74,
335
+ "NOUN.": 75,
336
+ "NOUN.|Polarity=Neg": 76,
337
+ "NOUN.|Polarity=Neg|[obl]": 77,
338
+ "NOUN.|Polarity=Neg|[root]": 78,
339
+ "NOUN.|[acl]": 79,
340
+ "NOUN.|[advcl]": 80,
341
+ "NOUN.|[ccomp]": 81,
342
+ "NOUN.|[compound]": 82,
343
+ "NOUN.|[csubj:outer]": 83,
344
+ "NOUN.|[csubj]": 84,
345
+ "NOUN.|[nmod]": 85,
346
+ "NOUN.|[nsubj:outer]": 86,
347
+ "NOUN.|[nsubj]": 87,
348
+ "NOUN.|[obj]": 88,
349
+ "NOUN.|[obl]": 89,
350
+ "NOUN.|[root]": 90,
351
+ "NOUN|Polarity=Neg": 91,
352
+ "NOUN|Polarity=Neg|[obl]": 92,
353
+ "NOUN|Polarity=Neg|[root]": 93,
354
+ "NOUN|[acl]": 94,
355
+ "NOUN|[advcl]": 95,
356
+ "NOUN|[ccomp]": 96,
357
+ "NOUN|[compound]": 97,
358
+ "NOUN|[csubj:outer]": 98,
359
+ "NOUN|[csubj]": 99,
360
+ "NOUN|[nmod]": 100,
361
+ "NOUN|[nsubj:outer]": 101,
362
+ "NOUN|[nsubj]": 102,
363
+ "NOUN|[obj]": 103,
364
+ "NOUN|[obl]": 104,
365
+ "NOUN|[root]": 105,
366
+ "NUM": 106,
367
+ "NUM.": 107,
368
+ "NUM.|[advcl]": 108,
369
+ "NUM.|[compound]": 109,
370
+ "NUM.|[nmod]": 110,
371
+ "NUM.|[nsubj:outer]": 111,
372
+ "NUM.|[nsubj]": 112,
373
+ "NUM.|[nummod]": 113,
374
+ "NUM.|[obj]": 114,
375
+ "NUM.|[obl]": 115,
376
+ "NUM.|[root]": 116,
377
+ "NUM|[advcl]": 117,
378
+ "NUM|[compound]": 118,
379
+ "NUM|[nmod]": 119,
380
+ "NUM|[nsubj:outer]": 120,
381
+ "NUM|[nsubj]": 121,
382
+ "NUM|[nummod]": 122,
383
+ "NUM|[obj]": 123,
384
+ "NUM|[obl]": 124,
385
+ "NUM|[root]": 125,
386
+ "PART": 126,
387
+ "PART.": 127,
388
+ "PART.|[mark]": 128,
389
+ "PART|[mark]": 129,
390
+ "PRON": 130,
391
+ "PRON.": 131,
392
+ "PRON.|[acl]": 132,
393
+ "PRON.|[advcl]": 133,
394
+ "PRON.|[nmod]": 134,
395
+ "PRON.|[nsubj:outer]": 135,
396
+ "PRON.|[nsubj]": 136,
397
+ "PRON.|[obj]": 137,
398
+ "PRON.|[obl]": 138,
399
+ "PRON.|[root]": 139,
400
+ "PRON|[acl]": 140,
401
+ "PRON|[advcl]": 141,
402
+ "PRON|[nmod]": 142,
403
+ "PRON|[nsubj:outer]": 143,
404
+ "PRON|[nsubj]": 144,
405
+ "PRON|[obj]": 145,
406
+ "PRON|[obl]": 146,
407
+ "PRON|[root]": 147,
408
+ "PROPN": 148,
409
+ "PROPN.": 149,
410
+ "PROPN.|[acl]": 150,
411
+ "PROPN.|[advcl]": 151,
412
+ "PROPN.|[compound]": 152,
413
+ "PROPN.|[nmod]": 153,
414
+ "PROPN.|[nsubj:outer]": 154,
415
+ "PROPN.|[nsubj]": 155,
416
+ "PROPN.|[obj]": 156,
417
+ "PROPN.|[obl]": 157,
418
+ "PROPN.|[root]": 158,
419
+ "PROPN|[acl]": 159,
420
+ "PROPN|[advcl]": 160,
421
+ "PROPN|[compound]": 161,
422
+ "PROPN|[nmod]": 162,
423
+ "PROPN|[nsubj:outer]": 163,
424
+ "PROPN|[nsubj]": 164,
425
+ "PROPN|[obj]": 165,
426
+ "PROPN|[obl]": 166,
427
+ "PROPN|[root]": 167,
428
+ "PUNCT": 168,
429
+ "PUNCT.": 169,
430
+ "PUNCT.|[punct]": 170,
431
+ "PUNCT|[punct]": 171,
432
+ "SCONJ": 172,
433
+ "SCONJ.": 173,
434
+ "SCONJ.|[dep]": 174,
435
+ "SCONJ.|[fixed]": 175,
436
+ "SCONJ.|[mark]": 176,
437
+ "SCONJ|[dep]": 177,
438
+ "SCONJ|[fixed]": 178,
439
+ "SCONJ|[mark]": 179,
440
+ "SYM": 180,
441
+ "SYM.": 181,
442
+ "SYM.|[compound]": 182,
443
+ "SYM.|[dep]": 183,
444
+ "SYM.|[nmod]": 184,
445
+ "SYM.|[obl]": 185,
446
+ "SYM|[compound]": 186,
447
+ "SYM|[dep]": 187,
448
+ "SYM|[nmod]": 188,
449
+ "SYM|[obl]": 189,
450
+ "VERB": 190,
451
+ "VERB.": 191,
452
+ "VERB.|[acl]": 192,
453
+ "VERB.|[advcl]": 193,
454
+ "VERB.|[ccomp]": 194,
455
+ "VERB.|[compound]": 195,
456
+ "VERB.|[csubj:outer]": 196,
457
+ "VERB.|[csubj]": 197,
458
+ "VERB.|[nmod]": 198,
459
+ "VERB.|[obj]": 199,
460
+ "VERB.|[obl]": 200,
461
+ "VERB.|[root]": 201,
462
+ "VERB|[acl]": 202,
463
+ "VERB|[advcl]": 203,
464
+ "VERB|[ccomp]": 204,
465
+ "VERB|[compound]": 205,
466
+ "VERB|[csubj:outer]": 206,
467
+ "VERB|[csubj]": 207,
468
+ "VERB|[nmod]": 208,
469
+ "VERB|[obj]": 209,
470
+ "VERB|[obl]": 210,
471
+ "VERB|[root]": 211,
472
+ "X": 212,
473
+ "X.": 213,
474
+ "X.|[dep]": 214,
475
+ "X.|[goeswith]": 215,
476
+ "X.|[nmod]": 216,
477
+ "X|[dep]": 217,
478
+ "X|[nmod]": 218
479
  },
480
  "layer_norm_eps": 1e-05,
481
  "local_attention": 128,
maker.py CHANGED
@@ -2,43 +2,33 @@
2
  src="KoichiYasuoka/modernbert-large-japanese-wikipedia-upos"
3
  tgt="KoichiYasuoka/modernbert-large-japanese-wikipedia-ud-square"
4
  url="https://github.com/UniversalDependencies/UD_Japanese-GSDLUW"
5
- import os
6
  d=os.path.basename(url)
7
  os.system("test -d "+d+" || git clone --depth=1 "+url)
8
  os.system("for F in train dev test ; do cp "+d+"/*-$F.conllu $F.conllu ; done")
9
- class UDTriangularDataset(object):
10
  def __init__(self,conllu,tokenizer):
11
  self.conllu=open(conllu,"r",encoding="utf-8")
12
  self.tokenizer=tokenizer
13
  self.seeks=[0]
14
- label=set(["SYM","X"])
15
- dep=set(["X|goeswith-s","X|goeswith-t"])
16
  s=self.conllu.readline()
17
  while s!="":
18
  if s=="\n":
19
- if 0<len(self.tokenizer(t)["input_ids"])<91:
20
- pass
21
- else:
22
- self.seeks.pop(-1)
23
  self.seeks.append(self.conllu.tell())
24
- elif s.startswith("# text ="):
25
- t=s[8:].strip()
26
  else:
27
  w=s.split("\t")
28
  if len(w)==10:
29
  if w[0].isdecimal():
30
- p=w[3] if w[5]=="_" else w[3]+"|"+w[5]
31
- label.add(p)
32
- if w[6]=="0":
33
- dep.add(p+"|"+w[7])
34
- else:
35
- dep.add(p+"|"+w[7]+"-s")
36
- dep.add(p+"|"+w[7]+"-t")
37
  s=self.conllu.readline()
38
- lid={l:i for i,l in enumerate(sorted(label))}
39
- for i,d in enumerate(sorted(dep),len(lid)):
40
- lid[d]=i
41
- self.label2id=lid
42
  def __call__(*args):
43
  lid={l:i for i,l in enumerate(sorted(set(sum([list(t.label2id) for t in args],[]))))}
44
  for t in args:
@@ -48,39 +38,55 @@ class UDTriangularDataset(object):
48
  self.conllu.close()
49
  __len__=lambda self:len(self.seeks)-1
50
  def __getitem__(self,i):
51
- s=self.seeks[i]
52
- self.conllu.seek(s)
53
  c,t=[],[""]
54
  while t[0]!="\n":
55
  t=self.conllu.readline().split("\t")
56
  if len(t)==10 and t[0].isdecimal():
57
  c.append(t)
 
 
 
 
58
  v=self.tokenizer([t[1] for t in c],add_special_tokens=False)["input_ids"]
59
  for i in range(len(v)-1,-1,-1):
60
- for j in range(1,len(v[i])):
61
- c.insert(i+1,[c[i][0],"_","_","X","_","_",c[i][0],"goeswith","_","_"])
 
 
 
 
62
  y=["0"]+[t[0] for t in c]
63
  h=[i if t[6]=="0" else y.index(t[6]) for i,t in enumerate(c,1)]
64
  p=[t[3] if t[5]=="_" else t[3]+"|"+t[5] for t in c]
65
- d=[t[7] for t in c]
 
 
 
 
 
 
 
 
 
 
 
66
  v=sum(v,[])
67
- ids=[self.tokenizer.cls_token_id]
68
- upos=["SYM"]
69
  for i in range(len(v)):
70
- for j in range(len(v)):
71
- ids.append(v[j])
72
- if i==j:
73
- upos.append(p[i]+"|"+d[i] if h[i]==j+1 else p[i]+"|"+d[i]+"-t")
74
- else:
75
- upos.append(p[i]+"|"+d[i]+"-s" if h[i]==j+1 else p[j])
76
- ids.append(self.tokenizer.sep_token_id)
77
- upos.append("SYM")
78
  return {"input_ids":ids,"labels":[self.label2id[p] for p in upos]}
79
  from transformers import AutoTokenizer,AutoConfig,AutoModelForTokenClassification,DataCollatorForTokenClassification,TrainingArguments,Trainer
80
  tkz=AutoTokenizer.from_pretrained(src)
81
- trainDS=UDTriangularDataset("train.conllu",tkz)
82
- devDS=UDTriangularDataset("dev.conllu",tkz)
83
- testDS=UDTriangularDataset("test.conllu",tkz)
84
  lid=trainDS(devDS,testDS)
85
  cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,id2label={i:l for l,i in lid.items()},ignore_mismatched_sizes=True,trust_remote_code=True)
86
  mdl=AutoModelForTokenClassification.from_pretrained(src,config=cfg,ignore_mismatched_sizes=True,trust_remote_code=True)
 
2
  src="KoichiYasuoka/modernbert-large-japanese-wikipedia-upos"
3
  tgt="KoichiYasuoka/modernbert-large-japanese-wikipedia-ud-square"
4
  url="https://github.com/UniversalDependencies/UD_Japanese-GSDLUW"
5
+ import os,numpy
6
  d=os.path.basename(url)
7
  os.system("test -d "+d+" || git clone --depth=1 "+url)
8
  os.system("for F in train dev test ; do cp "+d+"/*-$F.conllu $F.conllu ; done")
9
+ class UDSquareDataset(object):
10
  def __init__(self,conllu,tokenizer):
11
  self.conllu=open(conllu,"r",encoding="utf-8")
12
  self.tokenizer=tokenizer
13
  self.seeks=[0]
14
+ label=set(["SYM.","X.","X.|[goeswith]"])
 
15
  s=self.conllu.readline()
16
  while s!="":
17
  if s=="\n":
 
 
 
 
18
  self.seeks.append(self.conllu.tell())
 
 
19
  else:
20
  w=s.split("\t")
21
  if len(w)==10:
22
  if w[0].isdecimal():
23
+ p=w[3]
24
+ q="" if w[5]=="_" else "|"+w[5]
25
+ r="|["+w[7]+"]"
26
+ label.add(p+q)
27
+ label.add(p+"."+q)
28
+ label.add(p+q+r)
29
+ label.add(p+"."+q+r)
30
  s=self.conllu.readline()
31
+ self.label2id={l:i for i,l in enumerate(sorted(label))}
 
 
 
32
  def __call__(*args):
33
  lid={l:i for i,l in enumerate(sorted(set(sum([list(t.label2id) for t in args],[]))))}
34
  for t in args:
 
38
  self.conllu.close()
39
  __len__=lambda self:len(self.seeks)-1
40
  def __getitem__(self,i):
41
+ self.conllu.seek(self.seeks[i])
 
42
  c,t=[],[""]
43
  while t[0]!="\n":
44
  t=self.conllu.readline().split("\t")
45
  if len(t)==10 and t[0].isdecimal():
46
  c.append(t)
47
+ h={t[6] for t in c}
48
+ for t in c:
49
+ if t[6]!="0" and t[0] not in h:
50
+ t[3]+="."
51
  v=self.tokenizer([t[1] for t in c],add_special_tokens=False)["input_ids"]
52
  for i in range(len(v)-1,-1,-1):
53
+ if v[i]==[]:
54
+ v[i]=[self.tokenizer.unk_token_id]
55
+ if len(v[i])>1:
56
+ c[i][3]=c[i][3].replace(".","")
57
+ for j in range(1,len(v[i])):
58
+ c.insert(i+1,[c[i][0],"_","_","X.","_","_",c[i][0],"goeswith","_","_"])
59
  y=["0"]+[t[0] for t in c]
60
  h=[i if t[6]=="0" else y.index(t[6]) for i,t in enumerate(c,1)]
61
  p=[t[3] if t[5]=="_" else t[3]+"|"+t[5] for t in c]
62
+ d=["|["+t[7]+"]" for t in c]
63
+ x=[not t[3].endswith(".") for t in c]
64
+ if len(x)<90:
65
+ x=[True]*len(x)
66
+ else:
67
+ w=(sum([1 for b in x if b])+1)*(len(x)+1)+1
68
+ for i in numpy.argsort([-abs(j-i-1) for i,j in enumerate(h)]):
69
+ if w+len(x)>8191:
70
+ break
71
+ if not x[i]:
72
+ x[i]=True
73
+ w+=len(x)+1
74
  v=sum(v,[])
75
+ ids=[self.tokenizer.cls_token_id]+v+[self.tokenizer.sep_token_id]
76
+ upos=["SYM."]+p+["SYM."]
77
  for i in range(len(v)):
78
+ if x[i]:
79
+ for j in range(len(v)):
80
+ ids.append(self.tokenizer.mask_token_id if i==j else v[j])
81
+ upos.append(p[j]+d[j] if h[j]==i+1 else p[j])
82
+ ids.append(self.tokenizer.sep_token_id)
83
+ upos.append("SYM.")
 
 
84
  return {"input_ids":ids,"labels":[self.label2id[p] for p in upos]}
85
  from transformers import AutoTokenizer,AutoConfig,AutoModelForTokenClassification,DataCollatorForTokenClassification,TrainingArguments,Trainer
86
  tkz=AutoTokenizer.from_pretrained(src)
87
+ trainDS=UDSquareDataset("train.conllu",tkz)
88
+ devDS=UDSquareDataset("dev.conllu",tkz)
89
+ testDS=UDSquareDataset("test.conllu",tkz)
90
  lid=trainDS(devDS,testDS)
91
  cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,id2label={i:l for l,i in lid.items()},ignore_mismatched_sizes=True,trust_remote_code=True)
92
  mdl=AutoModelForTokenClassification.from_pretrained(src,config=cfg,ignore_mismatched_sizes=True,trust_remote_code=True)
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4be01ad1760cae99e4954f7d022fddce93f9da9cb1f9fa25b0b65281ebf2140b
3
- size 1644098178
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8cd7aff6611ce073b2bc046f702814d6c3c698086fdafc2267b23df981d3df60
3
+ size 1644212930
ud.py CHANGED
@@ -5,55 +5,79 @@ class UniversalDependenciesPipeline(TokenClassificationPipeline):
5
  def __init__(self,**kwargs):
6
  super().__init__(**kwargs)
7
  x=self.model.config.label2id
8
- self.root=numpy.full((len(x)),numpy.nan)
9
- self.arc_start=numpy.full((len(x)),numpy.nan)
10
- self.arc_tail=numpy.full((len(x)),numpy.nan)
11
  for k,v in x.items():
12
- if k.endswith("|root"):
13
  self.root[v]=0
14
- elif k.endswith("-s"):
15
- self.arc_start[v]=0
16
- elif k.endswith("-t"):
17
- self.arc_tail[v]=0
18
  def _forward(self,model_inputs):
19
  import torch
20
  v=model_inputs["input_ids"][0].tolist()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  with torch.no_grad():
22
- e=self.model(input_ids=torch.tensor([v+v[1:]*(len(v)-3)]).to(self.device))
23
- return {"logits":e.logits,**model_inputs}
24
  def check_model_type(self,supported_models):
25
  pass
26
  def postprocess(self,model_outputs,**kwargs):
27
  if "logits" not in model_outputs:
28
  return "".join(self.postprocess(x,**kwargs) for x in model_outputs)
29
  m=model_outputs["logits"][0].cpu().numpy()
30
- w=len(model_outputs["input_ids"][0])-2
31
- e=numpy.zeros((w,w,m.shape[-1]))
32
- for i in range(w):
33
- k=numpy.roll(m[i*(w+2)+1]+self.arc_tail,-1)
34
- for j in range(w):
35
- if i==j:
36
- e[i,i]=m[i*(w+1)+j+1]+self.root
37
- else:
38
- e[j,i]=m[i*(w+1)+j+1]+self.arc_start+k
39
- g=self.model.config.label2id["X|goeswith-s"]
40
- r=numpy.tri(e.shape[0])
 
 
 
41
  for i in range(e.shape[0]):
42
  for j in range(i+2,e.shape[1]):
43
- r[i,j]=r[i,j-1] if numpy.nanargmax(e[i,j-1])==g else 1
44
- e[:,:,g]+=numpy.where(r==0,0,numpy.nan)
45
- m,p=numpy.nanmax(e,axis=2),numpy.nanargmax(e,axis=2)
 
 
 
46
  h=self.chu_liu_edmonds(m)
47
  z=[i for i,j in enumerate(h) if i==j]
48
  if len(z)>1:
49
- k,h=z[numpy.nanargmax(m[z,z])],numpy.nanmin(m)-numpy.nanmax(m)
50
  m[:,z]+=[[0 if j in z and (i!=j or i==k) else h for i in z] for j in range(m.shape[0])]
51
  h=self.chu_liu_edmonds(m)
52
  v=[(s,e) for s,e in model_outputs["offset_mapping"][0].tolist() if s<e]
53
  q=[self.model.config.id2label[p[j,i]].split("|") for i,j in enumerate(h)]
54
  if "aggregation_strategy" in kwargs and kwargs["aggregation_strategy"]!="none":
55
  for i,j in reversed(list(enumerate(q[1:],1))):
56
- if j[-1]=="goeswith-s" and set([t[-1] for t in q[h[i]+1:i+1]])=={"goeswith-s"}:
57
  h=[b if i>b else b-1 for a,b in enumerate(h) if i!=a]
58
  v[i-1]=(v[i-1][0],v.pop(i)[1])
59
  q.pop(i)
@@ -64,10 +88,11 @@ class UniversalDependenciesPipeline(TokenClassificationPipeline):
64
  t=model_outputs["sentence"].replace("\n"," ")
65
  u="# text = "+t+"\n"
66
  for i,(s,e) in enumerate(v):
67
- u+="\t".join([str(i+1),t[s:e],"_",q[i][0],"_","_" if len(q[i])<3 else "|".join(q[i][1:-1]),str(0 if h[i]==i else h[i]+1),"root" if q[i][-1]=="root" else q[i][-1][0:-2],"_","_" if i+1<len(v) and e<v[i+1][0] else "SpaceAfter=No"])+"\n"
68
  return u+"\n"
69
  def chu_liu_edmonds(self,matrix):
70
- h=numpy.nanargmax(matrix,axis=0)
 
71
  x=[-1 if i==j else j for i,j in enumerate(h)]
72
  for b in [lambda x,i,j:-1 if i not in x else x[i],lambda x,i,j:-1 if j<0 else x[j]]:
73
  y=[]
@@ -78,10 +103,10 @@ class UniversalDependenciesPipeline(TokenClassificationPipeline):
78
  if max(x)<0:
79
  return h
80
  y,x=[i for i,j in enumerate(x) if j==max(x)],[i for i,j in enumerate(x) if j<max(x)]
81
- z=matrix-numpy.nanmax(matrix,axis=0)
82
- m=numpy.block([[z[x,:][:,x],numpy.nanmax(z[x,:][:,y],axis=1).reshape(len(x),1)],[numpy.nanmax(z[y,:][:,x],axis=0),numpy.nanmax(z[y,y])]])
83
- k=[j if i==len(x) else x[j] if j<len(x) else y[numpy.nanargmax(z[y,x[i]])] for i,j in enumerate(self.chu_liu_edmonds(m))]
84
  h=[j if i in y else k[x.index(i)] for i,j in enumerate(h)]
85
- i=y[numpy.nanargmax(z[x[k[-1]],y] if k[-1]<len(x) else z[y,y])]
86
  h[i]=x[k[-1]] if k[-1]<len(x) else i
87
  return h
 
5
  def __init__(self,**kwargs):
6
  super().__init__(**kwargs)
7
  x=self.model.config.label2id
8
+ self.root=numpy.full((len(x)),-numpy.inf)
9
+ self.arc=numpy.full((len(x)),-numpy.inf)
 
10
  for k,v in x.items():
11
+ if k.endswith("|[root]"):
12
  self.root[v]=0
13
+ elif k.endswith("]"):
14
+ self.arc[v]=0
 
 
15
  def _forward(self,model_inputs):
16
  import torch
17
  v=model_inputs["input_ids"][0].tolist()
18
+ if len(v)<91:
19
+ x=[True]*(len(v)-2)
20
+ else:
21
+ with torch.no_grad():
22
+ e=self.model(input_ids=torch.tensor([v]).to(self.device))
23
+ m=e.logits[0].cpu().numpy()
24
+ e=numpy.exp(m-numpy.max(m,axis=-1,keepdims=True))
25
+ z=e/e.sum(axis=-1,keepdims=True)
26
+ k=numpy.argmax(m,axis=1).tolist()
27
+ x=[not self.model.config.id2label[p].split("|")[0].endswith(".") for p in k[1:-1]]
28
+ w=(sum([1 for b in x if b])+1)*(len(x)+1)+1
29
+ for i in numpy.argsort([z[i+1,k[i+1]] for i in range(len(x))]):
30
+ if w+len(x)>8191:
31
+ break
32
+ if not x[i]:
33
+ x[i]=True
34
+ w+=len(x)+1
35
+ ids=list(v)
36
+ for i in range(len(x)):
37
+ if x[i]:
38
+ ids+=v[1:i+1]+[self.tokenizer.mask_token_id]+v[i+2:]
39
  with torch.no_grad():
40
+ e=self.model(input_ids=torch.tensor([ids]).to(self.device))
41
+ return {"logits":e.logits,"thin_out":x,**model_inputs}
42
  def check_model_type(self,supported_models):
43
  pass
44
  def postprocess(self,model_outputs,**kwargs):
45
  if "logits" not in model_outputs:
46
  return "".join(self.postprocess(x,**kwargs) for x in model_outputs)
47
  m=model_outputs["logits"][0].cpu().numpy()
48
+ x=model_outputs["thin_out"]
49
+ e=numpy.full((len(x),len(x),m.shape[-1]),m.min())
50
+ k=len(x)+2
51
+ for i in range(len(x)):
52
+ if x[i]:
53
+ for j in range(len(x)):
54
+ if i==j:
55
+ e[i,i]=m[k]+self.root
56
+ else:
57
+ e[i,j]=m[k]+self.arc
58
+ k+=1
59
+ k+=1
60
+ g=self.model.config.label2id["X.|[goeswith]"]
61
+ m,r=numpy.max(e,axis=2),numpy.tri(e.shape[0])
62
  for i in range(e.shape[0]):
63
  for j in range(i+2,e.shape[1]):
64
+ r[i,j]=1
65
+ if numpy.argmax(e[i,j-1])==g:
66
+ if numpy.argmax(m[:,j-1])==i:
67
+ r[i,j]=r[i,j-1]
68
+ e[:,:,g]+=numpy.where(r==0,0,-numpy.inf)
69
+ m,p=numpy.max(e,axis=2),numpy.argmax(e,axis=2)
70
  h=self.chu_liu_edmonds(m)
71
  z=[i for i,j in enumerate(h) if i==j]
72
  if len(z)>1:
73
+ k,h=z[numpy.argmax(m[z,z])],numpy.min(m)-numpy.max(m)
74
  m[:,z]+=[[0 if j in z and (i!=j or i==k) else h for i in z] for j in range(m.shape[0])]
75
  h=self.chu_liu_edmonds(m)
76
  v=[(s,e) for s,e in model_outputs["offset_mapping"][0].tolist() if s<e]
77
  q=[self.model.config.id2label[p[j,i]].split("|") for i,j in enumerate(h)]
78
  if "aggregation_strategy" in kwargs and kwargs["aggregation_strategy"]!="none":
79
  for i,j in reversed(list(enumerate(q[1:],1))):
80
+ if j[-1]=="[goeswith]" and set([t[-1] for t in q[h[i]+1:i+1]])=={"[goeswith]"}:
81
  h=[b if i>b else b-1 for a,b in enumerate(h) if i!=a]
82
  v[i-1]=(v[i-1][0],v.pop(i)[1])
83
  q.pop(i)
 
88
  t=model_outputs["sentence"].replace("\n"," ")
89
  u="# text = "+t+"\n"
90
  for i,(s,e) in enumerate(v):
91
+ u+="\t".join([str(i+1),t[s:e],"_",q[i][0].replace(".",""),"_","_" if len(q[i])<3 else "|".join(q[i][1:-1]),str(0 if h[i]==i else h[i]+1),q[i][-1][1:-1],"_","_" if i+1<len(v) and e<v[i+1][0] else "SpaceAfter=No"])+"\n"
92
  return u+"\n"
93
  def chu_liu_edmonds(self,matrix):
94
+ import numpy
95
+ h=numpy.argmax(matrix,axis=0)
96
  x=[-1 if i==j else j for i,j in enumerate(h)]
97
  for b in [lambda x,i,j:-1 if i not in x else x[i],lambda x,i,j:-1 if j<0 else x[j]]:
98
  y=[]
 
103
  if max(x)<0:
104
  return h
105
  y,x=[i for i,j in enumerate(x) if j==max(x)],[i for i,j in enumerate(x) if j<max(x)]
106
+ z=matrix-numpy.max(matrix,axis=0)
107
+ m=numpy.block([[z[x,:][:,x],numpy.max(z[x,:][:,y],axis=1).reshape(len(x),1)],[numpy.max(z[y,:][:,x],axis=0),numpy.max(z[y,y])]])
108
+ k=[j if i==len(x) else x[j] if j<len(x) else y[numpy.argmax(z[y,x[i]])] for i,j in enumerate(self.chu_liu_edmonds(m))]
109
  h=[j if i in y else k[x.index(i)] for i,j in enumerate(h)]
110
+ i=y[numpy.argmax(z[x[k[-1]],y] if k[-1]<len(x) else z[y,y])]
111
  h[i]=x[k[-1]] if k[-1]<len(x) else i
112
  return h