guipenedo HF staff commited on
Commit
bd41049
·
unverified ·
1 Parent(s): 0741edf

macrolanguages fix

Browse files
data/Afro-Asiatic.json CHANGED
@@ -6035,9 +6035,19 @@
6035
  "iso_1_code": "ar",
6036
  "iso_3_code": "acm",
6037
  "children": [],
6038
- "tokenizers": {},
 
 
 
 
 
 
 
 
6039
  "node_i": "487",
6040
- "native_tokenizers": [],
 
 
6041
  "scripts": [
6042
  "Arab"
6043
  ]
@@ -6097,9 +6107,19 @@
6097
  "iso_1_code": "ar",
6098
  "iso_3_code": "aeb",
6099
  "children": [],
6100
- "tokenizers": {},
 
 
 
 
 
 
 
 
6101
  "node_i": "493",
6102
- "native_tokenizers": [],
 
 
6103
  "scripts": [
6104
  "Arab"
6105
  ]
@@ -6139,9 +6159,19 @@
6139
  "iso_1_code": "ar",
6140
  "iso_3_code": "apc",
6141
  "children": [],
6142
- "tokenizers": {},
 
 
 
 
 
 
 
 
6143
  "node_i": "497",
6144
- "native_tokenizers": [],
 
 
6145
  "scripts": [
6146
  "Arab"
6147
  ]
@@ -6162,6 +6192,13 @@
6162
  "iso_3_code": "arb",
6163
  "children": [],
6164
  "tokenizers": {
 
 
 
 
 
 
 
6165
  "Latn": {
6166
  "full_object": "StanzaTokenizer(\"mt\")",
6167
  "original_lang_name": "maltese",
@@ -6171,7 +6208,9 @@
6171
  }
6172
  },
6173
  "node_i": "499",
6174
- "native_tokenizers": [],
 
 
6175
  "scripts": [
6176
  "Arab",
6177
  "Latn"
@@ -6182,9 +6221,19 @@
6182
  "iso_1_code": "ar",
6183
  "iso_3_code": "arq",
6184
  "children": [],
6185
- "tokenizers": {},
 
 
 
 
 
 
 
 
6186
  "node_i": "500",
6187
- "native_tokenizers": [],
 
 
6188
  "scripts": [
6189
  "Arab"
6190
  ]
@@ -6194,9 +6243,19 @@
6194
  "iso_1_code": "ar",
6195
  "iso_3_code": "ars",
6196
  "children": [],
6197
- "tokenizers": {},
 
 
 
 
 
 
 
 
6198
  "node_i": "501",
6199
- "native_tokenizers": [],
 
 
6200
  "scripts": [
6201
  "Arab"
6202
  ]
@@ -6206,9 +6265,19 @@
6206
  "iso_1_code": "ar",
6207
  "iso_3_code": "ary",
6208
  "children": [],
6209
- "tokenizers": {},
 
 
 
 
 
 
 
 
6210
  "node_i": "502",
6211
- "native_tokenizers": [],
 
 
6212
  "scripts": [
6213
  "Arab"
6214
  ]
@@ -6218,9 +6287,19 @@
6218
  "iso_1_code": "ar",
6219
  "iso_3_code": "arz",
6220
  "children": [],
6221
- "tokenizers": {},
 
 
 
 
 
 
 
 
6222
  "node_i": "503",
6223
- "native_tokenizers": [],
 
 
6224
  "scripts": [
6225
  "Arab"
6226
  ]
@@ -6280,9 +6359,19 @@
6280
  "iso_1_code": "ar",
6281
  "iso_3_code": "ayp",
6282
  "children": [],
6283
- "tokenizers": {},
 
 
 
 
 
 
 
 
6284
  "node_i": "509",
6285
- "native_tokenizers": [],
 
 
6286
  "scripts": [
6287
  "Arab"
6288
  ]
@@ -6334,9 +6423,19 @@
6334
  "iso_1_code": "ar",
6335
  "iso_3_code": "shu",
6336
  "children": [],
6337
- "tokenizers": {},
 
 
 
 
 
 
 
 
6338
  "node_i": "513",
6339
- "native_tokenizers": [],
 
 
6340
  "scripts": [
6341
  "Arab"
6342
  ]
@@ -6373,6 +6472,13 @@
6373
  }
6374
  ],
6375
  "tokenizers": {
 
 
 
 
 
 
 
6376
  "Latn": {
6377
  "full_object": "StanzaTokenizer(\"mt\")",
6378
  "original_lang_name": "maltese",
@@ -6460,6 +6566,13 @@
6460
  }
6461
  ],
6462
  "tokenizers": {
 
 
 
 
 
 
 
6463
  "Latn": {
6464
  "full_object": "StanzaTokenizer(\"mt\")",
6465
  "original_lang_name": "maltese",
@@ -6481,6 +6594,13 @@
6481
  }
6482
  ],
6483
  "tokenizers": {
 
 
 
 
 
 
 
6484
  "Latn": {
6485
  "full_object": "StanzaTokenizer(\"mt\")",
6486
  "original_lang_name": "maltese",
@@ -6951,6 +7071,13 @@
6951
  }
6952
  ],
6953
  "tokenizers": {
 
 
 
 
 
 
 
6954
  "Latn": {
6955
  "full_object": "StanzaTokenizer(\"mt\")",
6956
  "original_lang_name": "maltese",
@@ -7007,6 +7134,13 @@
7007
  "script": "Copt",
7008
  "class_name": "StanzaTokenizer"
7009
  },
 
 
 
 
 
 
 
7010
  "Latn": {
7011
  "full_object": "StanzaTokenizer(\"mt\")",
7012
  "original_lang_name": "maltese",
 
6035
  "iso_1_code": "ar",
6036
  "iso_3_code": "acm",
6037
  "children": [],
6038
+ "tokenizers": {
6039
+ "Arab": {
6040
+ "full_object": "SpaCyTokenizer(\"ar\")",
6041
+ "original_lang_name": "arabic",
6042
+ "original_lang_code": "ara",
6043
+ "script": "Arab",
6044
+ "class_name": "SpaCyTokenizer"
6045
+ }
6046
+ },
6047
  "node_i": "487",
6048
+ "native_tokenizers": [
6049
+ "Arab"
6050
+ ],
6051
  "scripts": [
6052
  "Arab"
6053
  ]
 
6107
  "iso_1_code": "ar",
6108
  "iso_3_code": "aeb",
6109
  "children": [],
6110
+ "tokenizers": {
6111
+ "Arab": {
6112
+ "full_object": "SpaCyTokenizer(\"ar\")",
6113
+ "original_lang_name": "arabic",
6114
+ "original_lang_code": "ara",
6115
+ "script": "Arab",
6116
+ "class_name": "SpaCyTokenizer"
6117
+ }
6118
+ },
6119
  "node_i": "493",
6120
+ "native_tokenizers": [
6121
+ "Arab"
6122
+ ],
6123
  "scripts": [
6124
  "Arab"
6125
  ]
 
6159
  "iso_1_code": "ar",
6160
  "iso_3_code": "apc",
6161
  "children": [],
6162
+ "tokenizers": {
6163
+ "Arab": {
6164
+ "full_object": "SpaCyTokenizer(\"ar\")",
6165
+ "original_lang_name": "arabic",
6166
+ "original_lang_code": "ara",
6167
+ "script": "Arab",
6168
+ "class_name": "SpaCyTokenizer"
6169
+ }
6170
+ },
6171
  "node_i": "497",
6172
+ "native_tokenizers": [
6173
+ "Arab"
6174
+ ],
6175
  "scripts": [
6176
  "Arab"
6177
  ]
 
6192
  "iso_3_code": "arb",
6193
  "children": [],
6194
  "tokenizers": {
6195
+ "Arab": {
6196
+ "full_object": "SpaCyTokenizer(\"ar\")",
6197
+ "original_lang_name": "arabic",
6198
+ "original_lang_code": "ara",
6199
+ "script": "Arab",
6200
+ "class_name": "SpaCyTokenizer"
6201
+ },
6202
  "Latn": {
6203
  "full_object": "StanzaTokenizer(\"mt\")",
6204
  "original_lang_name": "maltese",
 
6208
  }
6209
  },
6210
  "node_i": "499",
6211
+ "native_tokenizers": [
6212
+ "Arab"
6213
+ ],
6214
  "scripts": [
6215
  "Arab",
6216
  "Latn"
 
6221
  "iso_1_code": "ar",
6222
  "iso_3_code": "arq",
6223
  "children": [],
6224
+ "tokenizers": {
6225
+ "Arab": {
6226
+ "full_object": "SpaCyTokenizer(\"ar\")",
6227
+ "original_lang_name": "arabic",
6228
+ "original_lang_code": "ara",
6229
+ "script": "Arab",
6230
+ "class_name": "SpaCyTokenizer"
6231
+ }
6232
+ },
6233
  "node_i": "500",
6234
+ "native_tokenizers": [
6235
+ "Arab"
6236
+ ],
6237
  "scripts": [
6238
  "Arab"
6239
  ]
 
6243
  "iso_1_code": "ar",
6244
  "iso_3_code": "ars",
6245
  "children": [],
6246
+ "tokenizers": {
6247
+ "Arab": {
6248
+ "full_object": "SpaCyTokenizer(\"ar\")",
6249
+ "original_lang_name": "arabic",
6250
+ "original_lang_code": "ara",
6251
+ "script": "Arab",
6252
+ "class_name": "SpaCyTokenizer"
6253
+ }
6254
+ },
6255
  "node_i": "501",
6256
+ "native_tokenizers": [
6257
+ "Arab"
6258
+ ],
6259
  "scripts": [
6260
  "Arab"
6261
  ]
 
6265
  "iso_1_code": "ar",
6266
  "iso_3_code": "ary",
6267
  "children": [],
6268
+ "tokenizers": {
6269
+ "Arab": {
6270
+ "full_object": "SpaCyTokenizer(\"ar\")",
6271
+ "original_lang_name": "arabic",
6272
+ "original_lang_code": "ara",
6273
+ "script": "Arab",
6274
+ "class_name": "SpaCyTokenizer"
6275
+ }
6276
+ },
6277
  "node_i": "502",
6278
+ "native_tokenizers": [
6279
+ "Arab"
6280
+ ],
6281
  "scripts": [
6282
  "Arab"
6283
  ]
 
6287
  "iso_1_code": "ar",
6288
  "iso_3_code": "arz",
6289
  "children": [],
6290
+ "tokenizers": {
6291
+ "Arab": {
6292
+ "full_object": "SpaCyTokenizer(\"ar\")",
6293
+ "original_lang_name": "arabic",
6294
+ "original_lang_code": "ara",
6295
+ "script": "Arab",
6296
+ "class_name": "SpaCyTokenizer"
6297
+ }
6298
+ },
6299
  "node_i": "503",
6300
+ "native_tokenizers": [
6301
+ "Arab"
6302
+ ],
6303
  "scripts": [
6304
  "Arab"
6305
  ]
 
6359
  "iso_1_code": "ar",
6360
  "iso_3_code": "ayp",
6361
  "children": [],
6362
+ "tokenizers": {
6363
+ "Arab": {
6364
+ "full_object": "SpaCyTokenizer(\"ar\")",
6365
+ "original_lang_name": "arabic",
6366
+ "original_lang_code": "ara",
6367
+ "script": "Arab",
6368
+ "class_name": "SpaCyTokenizer"
6369
+ }
6370
+ },
6371
  "node_i": "509",
6372
+ "native_tokenizers": [
6373
+ "Arab"
6374
+ ],
6375
  "scripts": [
6376
  "Arab"
6377
  ]
 
6423
  "iso_1_code": "ar",
6424
  "iso_3_code": "shu",
6425
  "children": [],
6426
+ "tokenizers": {
6427
+ "Arab": {
6428
+ "full_object": "SpaCyTokenizer(\"ar\")",
6429
+ "original_lang_name": "arabic",
6430
+ "original_lang_code": "ara",
6431
+ "script": "Arab",
6432
+ "class_name": "SpaCyTokenizer"
6433
+ }
6434
+ },
6435
  "node_i": "513",
6436
+ "native_tokenizers": [
6437
+ "Arab"
6438
+ ],
6439
  "scripts": [
6440
  "Arab"
6441
  ]
 
6472
  }
6473
  ],
6474
  "tokenizers": {
6475
+ "Arab": {
6476
+ "full_object": "SpaCyTokenizer(\"ar\")",
6477
+ "original_lang_name": "arabic",
6478
+ "original_lang_code": "ara",
6479
+ "script": "Arab",
6480
+ "class_name": "SpaCyTokenizer"
6481
+ },
6482
  "Latn": {
6483
  "full_object": "StanzaTokenizer(\"mt\")",
6484
  "original_lang_name": "maltese",
 
6566
  }
6567
  ],
6568
  "tokenizers": {
6569
+ "Arab": {
6570
+ "full_object": "SpaCyTokenizer(\"ar\")",
6571
+ "original_lang_name": "arabic",
6572
+ "original_lang_code": "ara",
6573
+ "script": "Arab",
6574
+ "class_name": "SpaCyTokenizer"
6575
+ },
6576
  "Latn": {
6577
  "full_object": "StanzaTokenizer(\"mt\")",
6578
  "original_lang_name": "maltese",
 
6594
  }
6595
  ],
6596
  "tokenizers": {
6597
+ "Arab": {
6598
+ "full_object": "SpaCyTokenizer(\"ar\")",
6599
+ "original_lang_name": "arabic",
6600
+ "original_lang_code": "ara",
6601
+ "script": "Arab",
6602
+ "class_name": "SpaCyTokenizer"
6603
+ },
6604
  "Latn": {
6605
  "full_object": "StanzaTokenizer(\"mt\")",
6606
  "original_lang_name": "maltese",
 
7071
  }
7072
  ],
7073
  "tokenizers": {
7074
+ "Arab": {
7075
+ "full_object": "SpaCyTokenizer(\"ar\")",
7076
+ "original_lang_name": "arabic",
7077
+ "original_lang_code": "ara",
7078
+ "script": "Arab",
7079
+ "class_name": "SpaCyTokenizer"
7080
+ },
7081
  "Latn": {
7082
  "full_object": "StanzaTokenizer(\"mt\")",
7083
  "original_lang_name": "maltese",
 
7134
  "script": "Copt",
7135
  "class_name": "StanzaTokenizer"
7136
  },
7137
+ "Arab": {
7138
+ "full_object": "SpaCyTokenizer(\"ar\")",
7139
+ "original_lang_name": "arabic",
7140
+ "original_lang_code": "ara",
7141
+ "script": "Arab",
7142
+ "class_name": "SpaCyTokenizer"
7143
+ },
7144
  "Latn": {
7145
  "full_object": "StanzaTokenizer(\"mt\")",
7146
  "original_lang_name": "maltese",
data/Austronesian.json CHANGED
The diff for this file is too large to render. See raw diff
 
data/Creole.json CHANGED
@@ -833,11 +833,11 @@
833
  "children": [],
834
  "tokenizers": {
835
  "Latn": {
836
- "full_object": "StanzaTokenizer(\"pcm\")",
837
- "original_lang_name": "nigerian_pidgin",
838
- "original_lang_code": "pcm",
839
  "script": "Latn",
840
- "class_name": "StanzaTokenizer"
841
  }
842
  },
843
  "node_i": "3540",
@@ -863,11 +863,11 @@
863
  "children": [],
864
  "tokenizers": {
865
  "Latn": {
866
- "full_object": "StanzaTokenizer(\"pcm\")",
867
- "original_lang_name": "nigerian_pidgin",
868
- "original_lang_code": "pcm",
869
  "script": "Latn",
870
- "class_name": "StanzaTokenizer"
871
  }
872
  },
873
  "node_i": "3542",
@@ -883,11 +883,11 @@
883
  "children": [],
884
  "tokenizers": {
885
  "Latn": {
886
- "full_object": "StanzaTokenizer(\"pcm\")",
887
- "original_lang_name": "nigerian_pidgin",
888
- "original_lang_code": "pcm",
889
  "script": "Latn",
890
- "class_name": "StanzaTokenizer"
891
  }
892
  },
893
  "node_i": "3543",
@@ -903,11 +903,11 @@
903
  "children": [],
904
  "tokenizers": {
905
  "Latn": {
906
- "full_object": "StanzaTokenizer(\"pcm\")",
907
- "original_lang_name": "nigerian_pidgin",
908
- "original_lang_code": "pcm",
909
  "script": "Latn",
910
- "class_name": "StanzaTokenizer"
911
  }
912
  },
913
  "node_i": "3544",
@@ -923,11 +923,11 @@
923
  "children": [],
924
  "tokenizers": {
925
  "Latn": {
926
- "full_object": "StanzaTokenizer(\"pcm\")",
927
- "original_lang_name": "nigerian_pidgin",
928
- "original_lang_code": "pcm",
929
  "script": "Latn",
930
- "class_name": "StanzaTokenizer"
931
  }
932
  },
933
  "node_i": "3545",
@@ -963,11 +963,11 @@
963
  "children": [],
964
  "tokenizers": {
965
  "Latn": {
966
- "full_object": "StanzaTokenizer(\"pcm\")",
967
- "original_lang_name": "nigerian_pidgin",
968
- "original_lang_code": "pcm",
969
  "script": "Latn",
970
- "class_name": "StanzaTokenizer"
971
  }
972
  },
973
  "node_i": "3548",
@@ -983,11 +983,11 @@
983
  "children": [],
984
  "tokenizers": {
985
  "Latn": {
986
- "full_object": "StanzaTokenizer(\"pcm\")",
987
- "original_lang_name": "nigerian_pidgin",
988
- "original_lang_code": "pcm",
989
  "script": "Latn",
990
- "class_name": "StanzaTokenizer"
991
  }
992
  },
993
  "node_i": "3549",
@@ -1009,11 +1009,11 @@
1009
  ],
1010
  "tokenizers": {
1011
  "Latn": {
1012
- "full_object": "StanzaTokenizer(\"pcm\")",
1013
- "original_lang_name": "nigerian_pidgin",
1014
- "original_lang_code": "pcm",
1015
  "script": "Latn",
1016
- "class_name": "StanzaTokenizer"
1017
  }
1018
  },
1019
  "node_i": "3539",
@@ -1074,11 +1074,11 @@
1074
  "children": [],
1075
  "tokenizers": {
1076
  "Latn": {
1077
- "full_object": "StanzaTokenizer(\"pcm\")",
1078
- "original_lang_name": "nigerian_pidgin",
1079
- "original_lang_code": "pcm",
1080
  "script": "Latn",
1081
- "class_name": "StanzaTokenizer"
1082
  }
1083
  },
1084
  "node_i": "3556",
@@ -1090,11 +1090,11 @@
1090
  ],
1091
  "tokenizers": {
1092
  "Latn": {
1093
- "full_object": "StanzaTokenizer(\"pcm\")",
1094
- "original_lang_name": "nigerian_pidgin",
1095
- "original_lang_code": "pcm",
1096
  "script": "Latn",
1097
- "class_name": "StanzaTokenizer"
1098
  }
1099
  },
1100
  "node_i": "3555",
@@ -1134,11 +1134,11 @@
1134
  "children": [],
1135
  "tokenizers": {
1136
  "Latn": {
1137
- "full_object": "StanzaTokenizer(\"pcm\")",
1138
- "original_lang_name": "nigerian_pidgin",
1139
- "original_lang_code": "pcm",
1140
  "script": "Latn",
1141
- "class_name": "StanzaTokenizer"
1142
  }
1143
  },
1144
  "node_i": "3560",
@@ -1160,11 +1160,11 @@
1160
  ],
1161
  "tokenizers": {
1162
  "Latn": {
1163
- "full_object": "StanzaTokenizer(\"pcm\")",
1164
- "original_lang_name": "nigerian_pidgin",
1165
- "original_lang_code": "pcm",
1166
  "script": "Latn",
1167
- "class_name": "StanzaTokenizer"
1168
  }
1169
  },
1170
  "node_i": "3559",
@@ -1183,11 +1183,11 @@
1183
  "children": [],
1184
  "tokenizers": {
1185
  "Latn": {
1186
- "full_object": "StanzaTokenizer(\"pcm\")",
1187
- "original_lang_name": "nigerian_pidgin",
1188
- "original_lang_code": "pcm",
1189
  "script": "Latn",
1190
- "class_name": "StanzaTokenizer"
1191
  }
1192
  },
1193
  "node_i": "3563",
@@ -1203,11 +1203,11 @@
1203
  "children": [],
1204
  "tokenizers": {
1205
  "Latn": {
1206
- "full_object": "StanzaTokenizer(\"pcm\")",
1207
- "original_lang_name": "nigerian_pidgin",
1208
- "original_lang_code": "pcm",
1209
  "script": "Latn",
1210
- "class_name": "StanzaTokenizer"
1211
  }
1212
  },
1213
  "node_i": "3564",
@@ -1263,15 +1263,17 @@
1263
  "children": [],
1264
  "tokenizers": {
1265
  "Latn": {
1266
- "full_object": "StanzaTokenizer(\"pcm\")",
1267
- "original_lang_name": "nigerian_pidgin",
1268
- "original_lang_code": "pcm",
1269
  "script": "Latn",
1270
- "class_name": "StanzaTokenizer"
1271
  }
1272
  },
1273
  "node_i": "3569",
1274
- "native_tokenizers": [],
 
 
1275
  "scripts": [
1276
  "Latn"
1277
  ]
@@ -1283,11 +1285,11 @@
1283
  "children": [],
1284
  "tokenizers": {
1285
  "Latn": {
1286
- "full_object": "StanzaTokenizer(\"pcm\")",
1287
- "original_lang_name": "nigerian_pidgin",
1288
- "original_lang_code": "pcm",
1289
  "script": "Latn",
1290
- "class_name": "StanzaTokenizer"
1291
  }
1292
  },
1293
  "node_i": "3570",
@@ -1313,11 +1315,11 @@
1313
  "children": [],
1314
  "tokenizers": {
1315
  "Latn": {
1316
- "full_object": "StanzaTokenizer(\"pcm\")",
1317
- "original_lang_name": "nigerian_pidgin",
1318
- "original_lang_code": "pcm",
1319
  "script": "Latn",
1320
- "class_name": "StanzaTokenizer"
1321
  }
1322
  },
1323
  "node_i": "3572",
@@ -1363,15 +1365,17 @@
1363
  "children": [],
1364
  "tokenizers": {
1365
  "Latn": {
1366
- "full_object": "StanzaTokenizer(\"pcm\")",
1367
- "original_lang_name": "nigerian_pidgin",
1368
- "original_lang_code": "pcm",
1369
  "script": "Latn",
1370
- "class_name": "StanzaTokenizer"
1371
  }
1372
  },
1373
  "node_i": "3576",
1374
- "native_tokenizers": [],
 
 
1375
  "scripts": [
1376
  "Latn"
1377
  ]
@@ -1379,11 +1383,11 @@
1379
  ],
1380
  "tokenizers": {
1381
  "Latn": {
1382
- "full_object": "StanzaTokenizer(\"pcm\")",
1383
- "original_lang_name": "nigerian_pidgin",
1384
- "original_lang_code": "pcm",
1385
  "script": "Latn",
1386
- "class_name": "StanzaTokenizer"
1387
  }
1388
  },
1389
  "node_i": "3562",
@@ -1402,11 +1406,11 @@
1402
  "children": [],
1403
  "tokenizers": {
1404
  "Latn": {
1405
- "full_object": "StanzaTokenizer(\"pcm\")",
1406
- "original_lang_name": "nigerian_pidgin",
1407
- "original_lang_code": "pcm",
1408
  "script": "Latn",
1409
- "class_name": "StanzaTokenizer"
1410
  }
1411
  },
1412
  "node_i": "3578",
@@ -1428,11 +1432,11 @@
1428
  ],
1429
  "tokenizers": {
1430
  "Latn": {
1431
- "full_object": "StanzaTokenizer(\"pcm\")",
1432
- "original_lang_name": "nigerian_pidgin",
1433
- "original_lang_code": "pcm",
1434
  "script": "Latn",
1435
- "class_name": "StanzaTokenizer"
1436
  }
1437
  },
1438
  "node_i": "3577",
@@ -1471,11 +1475,11 @@
1471
  "children": [],
1472
  "tokenizers": {
1473
  "Latn": {
1474
- "full_object": "StanzaTokenizer(\"pcm\")",
1475
- "original_lang_name": "nigerian_pidgin",
1476
- "original_lang_code": "pcm",
1477
  "script": "Latn",
1478
- "class_name": "StanzaTokenizer"
1479
  }
1480
  },
1481
  "node_i": "3583",
@@ -1511,11 +1515,11 @@
1511
  "children": [],
1512
  "tokenizers": {
1513
  "Latn": {
1514
- "full_object": "StanzaTokenizer(\"pcm\")",
1515
- "original_lang_name": "nigerian_pidgin",
1516
- "original_lang_code": "pcm",
1517
  "script": "Latn",
1518
- "class_name": "StanzaTokenizer"
1519
  }
1520
  },
1521
  "node_i": "3586",
@@ -1551,11 +1555,11 @@
1551
  "children": [],
1552
  "tokenizers": {
1553
  "Latn": {
1554
- "full_object": "StanzaTokenizer(\"pcm\")",
1555
- "original_lang_name": "nigerian_pidgin",
1556
- "original_lang_code": "pcm",
1557
  "script": "Latn",
1558
- "class_name": "StanzaTokenizer"
1559
  }
1560
  },
1561
  "node_i": "3589",
@@ -1607,11 +1611,11 @@
1607
  ],
1608
  "tokenizers": {
1609
  "Latn": {
1610
- "full_object": "StanzaTokenizer(\"pcm\")",
1611
- "original_lang_name": "nigerian_pidgin",
1612
- "original_lang_code": "pcm",
1613
  "script": "Latn",
1614
- "class_name": "StanzaTokenizer"
1615
  }
1616
  },
1617
  "node_i": "3580",
@@ -1630,11 +1634,11 @@
1630
  "children": [],
1631
  "tokenizers": {
1632
  "Latn": {
1633
- "full_object": "StanzaTokenizer(\"pcm\")",
1634
- "original_lang_name": "nigerian_pidgin",
1635
- "original_lang_code": "pcm",
1636
  "script": "Latn",
1637
- "class_name": "StanzaTokenizer"
1638
  }
1639
  },
1640
  "node_i": "3595",
@@ -1656,11 +1660,11 @@
1656
  ],
1657
  "tokenizers": {
1658
  "Latn": {
1659
- "full_object": "StanzaTokenizer(\"pcm\")",
1660
- "original_lang_name": "nigerian_pidgin",
1661
- "original_lang_code": "pcm",
1662
  "script": "Latn",
1663
- "class_name": "StanzaTokenizer"
1664
  }
1665
  },
1666
  "node_i": "3594",
@@ -1700,11 +1704,11 @@
1700
  "children": [],
1701
  "tokenizers": {
1702
  "Latn": {
1703
- "full_object": "StanzaTokenizer(\"pcm\")",
1704
- "original_lang_name": "nigerian_pidgin",
1705
- "original_lang_code": "pcm",
1706
  "script": "Latn",
1707
- "class_name": "StanzaTokenizer"
1708
  }
1709
  },
1710
  "node_i": "3600",
@@ -1716,11 +1720,11 @@
1716
  ],
1717
  "tokenizers": {
1718
  "Latn": {
1719
- "full_object": "StanzaTokenizer(\"pcm\")",
1720
- "original_lang_name": "nigerian_pidgin",
1721
- "original_lang_code": "pcm",
1722
  "script": "Latn",
1723
- "class_name": "StanzaTokenizer"
1724
  }
1725
  },
1726
  "node_i": "3599",
@@ -1730,11 +1734,11 @@
1730
  ],
1731
  "tokenizers": {
1732
  "Latn": {
1733
- "full_object": "StanzaTokenizer(\"pcm\")",
1734
- "original_lang_name": "nigerian_pidgin",
1735
- "original_lang_code": "pcm",
1736
  "script": "Latn",
1737
- "class_name": "StanzaTokenizer"
1738
  }
1739
  },
1740
  "node_i": "3481",
 
833
  "children": [],
834
  "tokenizers": {
835
  "Latn": {
836
+ "full_object": "SpaCyTokenizer(\"ms\")",
837
+ "original_lang_name": "malay",
838
+ "original_lang_code": "msa",
839
  "script": "Latn",
840
+ "class_name": "SpaCyTokenizer"
841
  }
842
  },
843
  "node_i": "3540",
 
863
  "children": [],
864
  "tokenizers": {
865
  "Latn": {
866
+ "full_object": "SpaCyTokenizer(\"ms\")",
867
+ "original_lang_name": "malay",
868
+ "original_lang_code": "msa",
869
  "script": "Latn",
870
+ "class_name": "SpaCyTokenizer"
871
  }
872
  },
873
  "node_i": "3542",
 
883
  "children": [],
884
  "tokenizers": {
885
  "Latn": {
886
+ "full_object": "SpaCyTokenizer(\"ms\")",
887
+ "original_lang_name": "malay",
888
+ "original_lang_code": "msa",
889
  "script": "Latn",
890
+ "class_name": "SpaCyTokenizer"
891
  }
892
  },
893
  "node_i": "3543",
 
903
  "children": [],
904
  "tokenizers": {
905
  "Latn": {
906
+ "full_object": "SpaCyTokenizer(\"ms\")",
907
+ "original_lang_name": "malay",
908
+ "original_lang_code": "msa",
909
  "script": "Latn",
910
+ "class_name": "SpaCyTokenizer"
911
  }
912
  },
913
  "node_i": "3544",
 
923
  "children": [],
924
  "tokenizers": {
925
  "Latn": {
926
+ "full_object": "SpaCyTokenizer(\"ms\")",
927
+ "original_lang_name": "malay",
928
+ "original_lang_code": "msa",
929
  "script": "Latn",
930
+ "class_name": "SpaCyTokenizer"
931
  }
932
  },
933
  "node_i": "3545",
 
963
  "children": [],
964
  "tokenizers": {
965
  "Latn": {
966
+ "full_object": "SpaCyTokenizer(\"ms\")",
967
+ "original_lang_name": "malay",
968
+ "original_lang_code": "msa",
969
  "script": "Latn",
970
+ "class_name": "SpaCyTokenizer"
971
  }
972
  },
973
  "node_i": "3548",
 
983
  "children": [],
984
  "tokenizers": {
985
  "Latn": {
986
+ "full_object": "SpaCyTokenizer(\"ms\")",
987
+ "original_lang_name": "malay",
988
+ "original_lang_code": "msa",
989
  "script": "Latn",
990
+ "class_name": "SpaCyTokenizer"
991
  }
992
  },
993
  "node_i": "3549",
 
1009
  ],
1010
  "tokenizers": {
1011
  "Latn": {
1012
+ "full_object": "SpaCyTokenizer(\"ms\")",
1013
+ "original_lang_name": "malay",
1014
+ "original_lang_code": "msa",
1015
  "script": "Latn",
1016
+ "class_name": "SpaCyTokenizer"
1017
  }
1018
  },
1019
  "node_i": "3539",
 
1074
  "children": [],
1075
  "tokenizers": {
1076
  "Latn": {
1077
+ "full_object": "SpaCyTokenizer(\"ms\")",
1078
+ "original_lang_name": "malay",
1079
+ "original_lang_code": "msa",
1080
  "script": "Latn",
1081
+ "class_name": "SpaCyTokenizer"
1082
  }
1083
  },
1084
  "node_i": "3556",
 
1090
  ],
1091
  "tokenizers": {
1092
  "Latn": {
1093
+ "full_object": "SpaCyTokenizer(\"ms\")",
1094
+ "original_lang_name": "malay",
1095
+ "original_lang_code": "msa",
1096
  "script": "Latn",
1097
+ "class_name": "SpaCyTokenizer"
1098
  }
1099
  },
1100
  "node_i": "3555",
 
1134
  "children": [],
1135
  "tokenizers": {
1136
  "Latn": {
1137
+ "full_object": "SpaCyTokenizer(\"ms\")",
1138
+ "original_lang_name": "malay",
1139
+ "original_lang_code": "msa",
1140
  "script": "Latn",
1141
+ "class_name": "SpaCyTokenizer"
1142
  }
1143
  },
1144
  "node_i": "3560",
 
1160
  ],
1161
  "tokenizers": {
1162
  "Latn": {
1163
+ "full_object": "SpaCyTokenizer(\"ms\")",
1164
+ "original_lang_name": "malay",
1165
+ "original_lang_code": "msa",
1166
  "script": "Latn",
1167
+ "class_name": "SpaCyTokenizer"
1168
  }
1169
  },
1170
  "node_i": "3559",
 
1183
  "children": [],
1184
  "tokenizers": {
1185
  "Latn": {
1186
+ "full_object": "SpaCyTokenizer(\"ms\")",
1187
+ "original_lang_name": "malay",
1188
+ "original_lang_code": "msa",
1189
  "script": "Latn",
1190
+ "class_name": "SpaCyTokenizer"
1191
  }
1192
  },
1193
  "node_i": "3563",
 
1203
  "children": [],
1204
  "tokenizers": {
1205
  "Latn": {
1206
+ "full_object": "SpaCyTokenizer(\"ms\")",
1207
+ "original_lang_name": "malay",
1208
+ "original_lang_code": "msa",
1209
  "script": "Latn",
1210
+ "class_name": "SpaCyTokenizer"
1211
  }
1212
  },
1213
  "node_i": "3564",
 
1263
  "children": [],
1264
  "tokenizers": {
1265
  "Latn": {
1266
+ "full_object": "SpaCyTokenizer(\"ms\")",
1267
+ "original_lang_name": "malay",
1268
+ "original_lang_code": "msa",
1269
  "script": "Latn",
1270
+ "class_name": "SpaCyTokenizer"
1271
  }
1272
  },
1273
  "node_i": "3569",
1274
+ "native_tokenizers": [
1275
+ "Latn"
1276
+ ],
1277
  "scripts": [
1278
  "Latn"
1279
  ]
 
1285
  "children": [],
1286
  "tokenizers": {
1287
  "Latn": {
1288
+ "full_object": "SpaCyTokenizer(\"ms\")",
1289
+ "original_lang_name": "malay",
1290
+ "original_lang_code": "msa",
1291
  "script": "Latn",
1292
+ "class_name": "SpaCyTokenizer"
1293
  }
1294
  },
1295
  "node_i": "3570",
 
1315
  "children": [],
1316
  "tokenizers": {
1317
  "Latn": {
1318
+ "full_object": "SpaCyTokenizer(\"ms\")",
1319
+ "original_lang_name": "malay",
1320
+ "original_lang_code": "msa",
1321
  "script": "Latn",
1322
+ "class_name": "SpaCyTokenizer"
1323
  }
1324
  },
1325
  "node_i": "3572",
 
1365
  "children": [],
1366
  "tokenizers": {
1367
  "Latn": {
1368
+ "full_object": "SpaCyTokenizer(\"ms\")",
1369
+ "original_lang_name": "malay",
1370
+ "original_lang_code": "msa",
1371
  "script": "Latn",
1372
+ "class_name": "SpaCyTokenizer"
1373
  }
1374
  },
1375
  "node_i": "3576",
1376
+ "native_tokenizers": [
1377
+ "Latn"
1378
+ ],
1379
  "scripts": [
1380
  "Latn"
1381
  ]
 
1383
  ],
1384
  "tokenizers": {
1385
  "Latn": {
1386
+ "full_object": "SpaCyTokenizer(\"ms\")",
1387
+ "original_lang_name": "malay",
1388
+ "original_lang_code": "msa",
1389
  "script": "Latn",
1390
+ "class_name": "SpaCyTokenizer"
1391
  }
1392
  },
1393
  "node_i": "3562",
 
1406
  "children": [],
1407
  "tokenizers": {
1408
  "Latn": {
1409
+ "full_object": "SpaCyTokenizer(\"ms\")",
1410
+ "original_lang_name": "malay",
1411
+ "original_lang_code": "msa",
1412
  "script": "Latn",
1413
+ "class_name": "SpaCyTokenizer"
1414
  }
1415
  },
1416
  "node_i": "3578",
 
1432
  ],
1433
  "tokenizers": {
1434
  "Latn": {
1435
+ "full_object": "SpaCyTokenizer(\"ms\")",
1436
+ "original_lang_name": "malay",
1437
+ "original_lang_code": "msa",
1438
  "script": "Latn",
1439
+ "class_name": "SpaCyTokenizer"
1440
  }
1441
  },
1442
  "node_i": "3577",
 
1475
  "children": [],
1476
  "tokenizers": {
1477
  "Latn": {
1478
+ "full_object": "SpaCyTokenizer(\"ms\")",
1479
+ "original_lang_name": "malay",
1480
+ "original_lang_code": "msa",
1481
  "script": "Latn",
1482
+ "class_name": "SpaCyTokenizer"
1483
  }
1484
  },
1485
  "node_i": "3583",
 
1515
  "children": [],
1516
  "tokenizers": {
1517
  "Latn": {
1518
+ "full_object": "SpaCyTokenizer(\"ms\")",
1519
+ "original_lang_name": "malay",
1520
+ "original_lang_code": "msa",
1521
  "script": "Latn",
1522
+ "class_name": "SpaCyTokenizer"
1523
  }
1524
  },
1525
  "node_i": "3586",
 
1555
  "children": [],
1556
  "tokenizers": {
1557
  "Latn": {
1558
+ "full_object": "SpaCyTokenizer(\"ms\")",
1559
+ "original_lang_name": "malay",
1560
+ "original_lang_code": "msa",
1561
  "script": "Latn",
1562
+ "class_name": "SpaCyTokenizer"
1563
  }
1564
  },
1565
  "node_i": "3589",
 
1611
  ],
1612
  "tokenizers": {
1613
  "Latn": {
1614
+ "full_object": "SpaCyTokenizer(\"ms\")",
1615
+ "original_lang_name": "malay",
1616
+ "original_lang_code": "msa",
1617
  "script": "Latn",
1618
+ "class_name": "SpaCyTokenizer"
1619
  }
1620
  },
1621
  "node_i": "3580",
 
1634
  "children": [],
1635
  "tokenizers": {
1636
  "Latn": {
1637
+ "full_object": "SpaCyTokenizer(\"ms\")",
1638
+ "original_lang_name": "malay",
1639
+ "original_lang_code": "msa",
1640
  "script": "Latn",
1641
+ "class_name": "SpaCyTokenizer"
1642
  }
1643
  },
1644
  "node_i": "3595",
 
1660
  ],
1661
  "tokenizers": {
1662
  "Latn": {
1663
+ "full_object": "SpaCyTokenizer(\"ms\")",
1664
+ "original_lang_name": "malay",
1665
+ "original_lang_code": "msa",
1666
  "script": "Latn",
1667
+ "class_name": "SpaCyTokenizer"
1668
  }
1669
  },
1670
  "node_i": "3594",
 
1704
  "children": [],
1705
  "tokenizers": {
1706
  "Latn": {
1707
+ "full_object": "SpaCyTokenizer(\"ms\")",
1708
+ "original_lang_name": "malay",
1709
+ "original_lang_code": "msa",
1710
  "script": "Latn",
1711
+ "class_name": "SpaCyTokenizer"
1712
  }
1713
  },
1714
  "node_i": "3600",
 
1720
  ],
1721
  "tokenizers": {
1722
  "Latn": {
1723
+ "full_object": "SpaCyTokenizer(\"ms\")",
1724
+ "original_lang_name": "malay",
1725
+ "original_lang_code": "msa",
1726
  "script": "Latn",
1727
+ "class_name": "SpaCyTokenizer"
1728
  }
1729
  },
1730
  "node_i": "3599",
 
1734
  ],
1735
  "tokenizers": {
1736
  "Latn": {
1737
+ "full_object": "SpaCyTokenizer(\"ms\")",
1738
+ "original_lang_name": "malay",
1739
+ "original_lang_code": "msa",
1740
  "script": "Latn",
1741
+ "class_name": "SpaCyTokenizer"
1742
  }
1743
  },
1744
  "node_i": "3481",
data/Indo-European.json CHANGED
@@ -20,15 +20,17 @@
20
  "children": [],
21
  "tokenizers": {
22
  "Latn": {
23
- "full_object": "SpaCyTokenizer(\"en\")",
24
- "original_lang_name": "english",
25
- "original_lang_code": "eng",
26
  "script": "Latn",
27
  "class_name": "SpaCyTokenizer"
28
  }
29
  },
30
  "node_i": "3922",
31
- "native_tokenizers": [],
 
 
32
  "scripts": [
33
  "Latn"
34
  ]
@@ -36,9 +38,9 @@
36
  ],
37
  "tokenizers": {
38
  "Latn": {
39
- "full_object": "SpaCyTokenizer(\"en\")",
40
- "original_lang_name": "english",
41
- "original_lang_code": "eng",
42
  "script": "Latn",
43
  "class_name": "SpaCyTokenizer"
44
  }
@@ -79,15 +81,17 @@
79
  "children": [],
80
  "tokenizers": {
81
  "Latn": {
82
- "full_object": "SpaCyTokenizer(\"en\")",
83
- "original_lang_name": "english",
84
- "original_lang_code": "eng",
85
  "script": "Latn",
86
  "class_name": "SpaCyTokenizer"
87
  }
88
  },
89
  "node_i": "3926",
90
- "native_tokenizers": [],
 
 
91
  "scripts": [
92
  "Latn"
93
  ]
@@ -95,9 +99,9 @@
95
  ],
96
  "tokenizers": {
97
  "Latn": {
98
- "full_object": "SpaCyTokenizer(\"en\")",
99
- "original_lang_name": "english",
100
- "original_lang_code": "eng",
101
  "script": "Latn",
102
  "class_name": "SpaCyTokenizer"
103
  }
@@ -109,9 +113,9 @@
109
  ],
110
  "tokenizers": {
111
  "Latn": {
112
- "full_object": "SpaCyTokenizer(\"en\")",
113
- "original_lang_name": "english",
114
- "original_lang_code": "eng",
115
  "script": "Latn",
116
  "class_name": "SpaCyTokenizer"
117
  }
@@ -227,15 +231,17 @@
227
  "children": [],
228
  "tokenizers": {
229
  "Latn": {
230
- "full_object": "SpaCyTokenizer(\"lt\")",
231
- "original_lang_name": "lithuanian",
232
- "original_lang_code": "lit",
233
  "script": "Latn",
234
  "class_name": "SpaCyTokenizer"
235
  }
236
  },
237
  "node_i": "3934",
238
- "native_tokenizers": [],
 
 
239
  "scripts": [
240
  "Latn"
241
  ]
@@ -247,15 +253,17 @@
247
  "children": [],
248
  "tokenizers": {
249
  "Latn": {
250
- "full_object": "SpaCyTokenizer(\"lt\")",
251
- "original_lang_name": "lithuanian",
252
- "original_lang_code": "lit",
253
  "script": "Latn",
254
  "class_name": "SpaCyTokenizer"
255
  }
256
  },
257
  "node_i": "3935",
258
- "native_tokenizers": [],
 
 
259
  "scripts": [
260
  "Latn"
261
  ]
@@ -267,9 +275,9 @@
267
  "children": [],
268
  "tokenizers": {
269
  "Latn": {
270
- "full_object": "SpaCyTokenizer(\"lt\")",
271
- "original_lang_name": "lithuanian",
272
- "original_lang_code": "lit",
273
  "script": "Latn",
274
  "class_name": "SpaCyTokenizer"
275
  }
@@ -293,9 +301,9 @@
293
  ],
294
  "tokenizers": {
295
  "Latn": {
296
- "full_object": "SpaCyTokenizer(\"lt\")",
297
- "original_lang_name": "lithuanian",
298
- "original_lang_code": "lit",
299
  "script": "Latn",
300
  "class_name": "SpaCyTokenizer"
301
  }
@@ -316,9 +324,9 @@
316
  "children": [],
317
  "tokenizers": {
318
  "Latn": {
319
- "full_object": "SpaCyTokenizer(\"lt\")",
320
- "original_lang_name": "lithuanian",
321
- "original_lang_code": "lit",
322
  "script": "Latn",
323
  "class_name": "SpaCyTokenizer"
324
  }
@@ -342,9 +350,9 @@
342
  ],
343
  "tokenizers": {
344
  "Latn": {
345
- "full_object": "SpaCyTokenizer(\"lt\")",
346
- "original_lang_name": "lithuanian",
347
- "original_lang_code": "lit",
348
  "script": "Latn",
349
  "class_name": "SpaCyTokenizer"
350
  }
@@ -356,9 +364,9 @@
356
  ],
357
  "tokenizers": {
358
  "Latn": {
359
- "full_object": "SpaCyTokenizer(\"lt\")",
360
- "original_lang_name": "lithuanian",
361
- "original_lang_code": "lit",
362
  "script": "Latn",
363
  "class_name": "SpaCyTokenizer"
364
  }
@@ -589,15 +597,17 @@
589
  "children": [],
590
  "tokenizers": {
591
  "Latn": {
592
- "full_object": "SpaCyTokenizer(\"hr\")",
593
- "original_lang_name": "croatian",
594
- "original_lang_code": "hrv",
595
  "script": "Latn",
596
  "class_name": "SpaCyTokenizer"
597
  }
598
  },
599
  "node_i": "3954",
600
- "native_tokenizers": [],
 
 
601
  "scripts": [
602
  "Latn"
603
  ]
@@ -609,9 +619,9 @@
609
  "children": [],
610
  "tokenizers": {
611
  "Latn": {
612
- "full_object": "SpaCyTokenizer(\"hr\")",
613
- "original_lang_name": "croatian",
614
- "original_lang_code": "hrv",
615
  "script": "Latn",
616
  "class_name": "SpaCyTokenizer"
617
  }
@@ -629,15 +639,17 @@
629
  "children": [],
630
  "tokenizers": {
631
  "Latn": {
632
- "full_object": "SpaCyTokenizer(\"hr\")",
633
- "original_lang_name": "croatian",
634
- "original_lang_code": "hrv",
635
  "script": "Latn",
636
  "class_name": "SpaCyTokenizer"
637
  }
638
  },
639
  "node_i": "3956",
640
- "native_tokenizers": [],
 
 
641
  "scripts": [
642
  "Latn"
643
  ]
@@ -700,16 +712,17 @@
700
  "class_name": "SpaCyTokenizer"
701
  },
702
  "Latn": {
703
- "full_object": "SpaCyTokenizer(\"hr\")",
704
- "original_lang_name": "croatian",
705
- "original_lang_code": "hrv",
706
  "script": "Latn",
707
  "class_name": "SpaCyTokenizer"
708
  }
709
  },
710
  "node_i": "3959",
711
  "native_tokenizers": [
712
- "Cyrl"
 
713
  ],
714
  "scripts": [
715
  "Cyrl",
@@ -729,9 +742,9 @@
729
  ],
730
  "tokenizers": {
731
  "Latn": {
732
- "full_object": "SpaCyTokenizer(\"hr\")",
733
- "original_lang_name": "croatian",
734
- "original_lang_code": "hrv",
735
  "script": "Latn",
736
  "class_name": "SpaCyTokenizer"
737
  },
@@ -757,9 +770,9 @@
757
  "class_name": "SpaCyTokenizer"
758
  },
759
  "Latn": {
760
- "full_object": "SpaCyTokenizer(\"hr\")",
761
- "original_lang_name": "croatian",
762
- "original_lang_code": "hrv",
763
  "script": "Latn",
764
  "class_name": "SpaCyTokenizer"
765
  }
@@ -2694,9 +2707,9 @@
2694
  "children": [],
2695
  "tokenizers": {
2696
  "Deva": {
2697
- "full_object": "IndicNLPTokenizer(\"hi\")",
2698
- "original_lang_name": "hindi",
2699
- "original_lang_code": "hin",
2700
  "script": "Deva",
2701
  "class_name": "IndicNLPTokenizer"
2702
  }
@@ -2744,9 +2757,9 @@
2744
  "children": [],
2745
  "tokenizers": {
2746
  "Deva": {
2747
- "full_object": "IndicNLPTokenizer(\"hi\")",
2748
- "original_lang_name": "hindi",
2749
- "original_lang_code": "hin",
2750
  "script": "Deva",
2751
  "class_name": "IndicNLPTokenizer"
2752
  }
@@ -2780,9 +2793,9 @@
2780
  ],
2781
  "tokenizers": {
2782
  "Deva": {
2783
- "full_object": "IndicNLPTokenizer(\"hi\")",
2784
- "original_lang_name": "hindi",
2785
- "original_lang_code": "hin",
2786
  "script": "Deva",
2787
  "class_name": "IndicNLPTokenizer"
2788
  },
@@ -2810,15 +2823,17 @@
2810
  "children": [],
2811
  "tokenizers": {
2812
  "Deva": {
2813
- "full_object": "IndicNLPTokenizer(\"hi\")",
2814
- "original_lang_name": "hindi",
2815
- "original_lang_code": "hin",
2816
  "script": "Deva",
2817
  "class_name": "IndicNLPTokenizer"
2818
  }
2819
  },
2820
  "node_i": "4080",
2821
- "native_tokenizers": [],
 
 
2822
  "scripts": [
2823
  "Deva"
2824
  ]
@@ -2840,9 +2855,9 @@
2840
  "children": [],
2841
  "tokenizers": {
2842
  "Deva": {
2843
- "full_object": "IndicNLPTokenizer(\"hi\")",
2844
- "original_lang_name": "hindi",
2845
- "original_lang_code": "hin",
2846
  "script": "Deva",
2847
  "class_name": "IndicNLPTokenizer"
2848
  },
@@ -2855,7 +2870,9 @@
2855
  }
2856
  },
2857
  "node_i": "4082",
2858
- "native_tokenizers": [],
 
 
2859
  "scripts": [
2860
  "Latn",
2861
  "Deva"
@@ -2864,9 +2881,9 @@
2864
  ],
2865
  "tokenizers": {
2866
  "Deva": {
2867
- "full_object": "IndicNLPTokenizer(\"hi\")",
2868
- "original_lang_name": "hindi",
2869
- "original_lang_code": "hin",
2870
  "script": "Deva",
2871
  "class_name": "IndicNLPTokenizer"
2872
  },
@@ -2885,9 +2902,9 @@
2885
  ],
2886
  "tokenizers": {
2887
  "Deva": {
2888
- "full_object": "IndicNLPTokenizer(\"hi\")",
2889
- "original_lang_name": "hindi",
2890
- "original_lang_code": "hin",
2891
  "script": "Deva",
2892
  "class_name": "IndicNLPTokenizer"
2893
  },
@@ -3941,9 +3958,9 @@
3941
  "children": [],
3942
  "tokenizers": {
3943
  "Deva": {
3944
- "full_object": "IndicNLPTokenizer(\"hi\")",
3945
- "original_lang_name": "hindi",
3946
- "original_lang_code": "hin",
3947
  "script": "Deva",
3948
  "class_name": "IndicNLPTokenizer"
3949
  }
@@ -3967,9 +3984,9 @@
3967
  ],
3968
  "tokenizers": {
3969
  "Deva": {
3970
- "full_object": "IndicNLPTokenizer(\"hi\")",
3971
- "original_lang_name": "hindi",
3972
- "original_lang_code": "hin",
3973
  "script": "Deva",
3974
  "class_name": "IndicNLPTokenizer"
3975
  }
@@ -3981,9 +3998,9 @@
3981
  ],
3982
  "tokenizers": {
3983
  "Deva": {
3984
- "full_object": "IndicNLPTokenizer(\"hi\")",
3985
- "original_lang_name": "hindi",
3986
- "original_lang_code": "hin",
3987
  "script": "Deva",
3988
  "class_name": "IndicNLPTokenizer"
3989
  }
@@ -4295,9 +4312,9 @@
4295
  "class_name": "IndicNLPTokenizer"
4296
  },
4297
  "Deva": {
4298
- "full_object": "IndicNLPTokenizer(\"hi\")",
4299
- "original_lang_name": "hindi",
4300
- "original_lang_code": "hin",
4301
  "script": "Deva",
4302
  "class_name": "IndicNLPTokenizer"
4303
  },
@@ -4336,6 +4353,13 @@
4336
  }
4337
  ],
4338
  "tokenizers": {
 
 
 
 
 
 
 
4339
  "Gujr": {
4340
  "full_object": "IndicNLPTokenizer(\"gu\")",
4341
  "original_lang_name": "gujarati",
@@ -4350,13 +4374,6 @@
4350
  "script": "Guru",
4351
  "class_name": "IndicNLPTokenizer"
4352
  },
4353
- "Deva": {
4354
- "full_object": "IndicNLPTokenizer(\"hi\")",
4355
- "original_lang_name": "hindi",
4356
- "original_lang_code": "hin",
4357
- "script": "Deva",
4358
- "class_name": "IndicNLPTokenizer"
4359
- },
4360
  "Arab": {
4361
  "full_object": "IndicNLPTokenizer(\"ur\")",
4362
  "original_lang_name": "urdu",
@@ -4990,6 +5007,13 @@
4990
  "iso_3_code": "ory",
4991
  "children": [],
4992
  "tokenizers": {
 
 
 
 
 
 
 
4993
  "Latn": {
4994
  "full_object": "StanzaTokenizer(\"kmr\")",
4995
  "original_lang_name": "northern_kurdish",
@@ -4999,7 +5023,9 @@
4999
  }
5000
  },
5001
  "node_i": "4234",
5002
- "native_tokenizers": [],
 
 
5003
  "scripts": [
5004
  "Latn",
5005
  "Orya"
@@ -5027,6 +5053,13 @@
5027
  }
5028
  ],
5029
  "tokenizers": {
 
 
 
 
 
 
 
5030
  "Latn": {
5031
  "full_object": "StanzaTokenizer(\"kmr\")",
5032
  "original_lang_name": "northern_kurdish",
@@ -5114,6 +5147,13 @@
5114
  "script": "Deva",
5115
  "class_name": "IndicNLPTokenizer"
5116
  },
 
 
 
 
 
 
 
5117
  "Latn": {
5118
  "full_object": "StanzaTokenizer(\"kmr\")",
5119
  "original_lang_name": "northern_kurdish",
@@ -5756,9 +5796,9 @@
5756
  "children": [],
5757
  "tokenizers": {
5758
  "Deva": {
5759
- "full_object": "IndicNLPTokenizer(\"mr\")",
5760
- "original_lang_name": "marathi",
5761
- "original_lang_code": "mar",
5762
  "script": "Deva",
5763
  "class_name": "IndicNLPTokenizer"
5764
  },
@@ -5771,7 +5811,9 @@
5771
  }
5772
  },
5773
  "node_i": "4287",
5774
- "native_tokenizers": [],
 
 
5775
  "scripts": [
5776
  "Latn",
5777
  "Deva"
@@ -5784,9 +5826,9 @@
5784
  "children": [],
5785
  "tokenizers": {
5786
  "Deva": {
5787
- "full_object": "IndicNLPTokenizer(\"mr\")",
5788
- "original_lang_name": "marathi",
5789
- "original_lang_code": "mar",
5790
  "script": "Deva",
5791
  "class_name": "IndicNLPTokenizer"
5792
  }
@@ -5850,9 +5892,9 @@
5850
  ],
5851
  "tokenizers": {
5852
  "Deva": {
5853
- "full_object": "IndicNLPTokenizer(\"mr\")",
5854
- "original_lang_name": "marathi",
5855
- "original_lang_code": "mar",
5856
  "script": "Deva",
5857
  "class_name": "IndicNLPTokenizer"
5858
  },
@@ -6026,6 +6068,13 @@
6026
  "script": "Deva",
6027
  "class_name": "IndicNLPTokenizer"
6028
  },
 
 
 
 
 
 
 
6029
  "Arab": {
6030
  "full_object": "IndicNLPTokenizer(\"ur\")",
6031
  "original_lang_name": "sindhi",
@@ -6570,6 +6619,13 @@
6570
  "script": "Beng",
6571
  "class_name": "IndicNLPTokenizer"
6572
  },
 
 
 
 
 
 
 
6573
  "Arab": {
6574
  "full_object": "IndicNLPTokenizer(\"ur\")",
6575
  "original_lang_name": "urdu",
@@ -8215,6 +8271,13 @@
8215
  "script": "Beng",
8216
  "class_name": "IndicNLPTokenizer"
8217
  },
 
 
 
 
 
 
 
8218
  "Arab": {
8219
  "full_object": "IndicNLPTokenizer(\"ur\")",
8220
  "original_lang_name": "urdu",
@@ -9568,13 +9631,6 @@
9568
  }
9569
  ],
9570
  "tokenizers": {
9571
- "Armn": {
9572
- "full_object": "SpaCyTokenizer(\"hy\")",
9573
- "original_lang_name": "armenian",
9574
- "original_lang_code": "hye",
9575
- "script": "Armn",
9576
- "class_name": "SpaCyTokenizer"
9577
- },
9578
  "Latn": {
9579
  "full_object": "SpaCyTokenizer(\"en\")",
9580
  "original_lang_name": "english",
@@ -9582,6 +9638,13 @@
9582
  "script": "Latn",
9583
  "class_name": "SpaCyTokenizer"
9584
  },
 
 
 
 
 
 
 
9585
  "Cyrl": {
9586
  "full_object": "SpaCyTokenizer(\"ru\")",
9587
  "original_lang_name": "russian",
@@ -9624,6 +9687,13 @@
9624
  "script": "Beng",
9625
  "class_name": "IndicNLPTokenizer"
9626
  },
 
 
 
 
 
 
 
9627
  "Arab": {
9628
  "full_object": "IndicNLPTokenizer(\"ur\")",
9629
  "original_lang_name": "urdu",
 
20
  "children": [],
21
  "tokenizers": {
22
  "Latn": {
23
+ "full_object": "SpaCyTokenizer(\"sq\")",
24
+ "original_lang_name": "albanian",
25
+ "original_lang_code": "sqi",
26
  "script": "Latn",
27
  "class_name": "SpaCyTokenizer"
28
  }
29
  },
30
  "node_i": "3922",
31
+ "native_tokenizers": [
32
+ "Latn"
33
+ ],
34
  "scripts": [
35
  "Latn"
36
  ]
 
38
  ],
39
  "tokenizers": {
40
  "Latn": {
41
+ "full_object": "SpaCyTokenizer(\"sq\")",
42
+ "original_lang_name": "albanian",
43
+ "original_lang_code": "sqi",
44
  "script": "Latn",
45
  "class_name": "SpaCyTokenizer"
46
  }
 
81
  "children": [],
82
  "tokenizers": {
83
  "Latn": {
84
+ "full_object": "SpaCyTokenizer(\"sq\")",
85
+ "original_lang_name": "albanian",
86
+ "original_lang_code": "sqi",
87
  "script": "Latn",
88
  "class_name": "SpaCyTokenizer"
89
  }
90
  },
91
  "node_i": "3926",
92
+ "native_tokenizers": [
93
+ "Latn"
94
+ ],
95
  "scripts": [
96
  "Latn"
97
  ]
 
99
  ],
100
  "tokenizers": {
101
  "Latn": {
102
+ "full_object": "SpaCyTokenizer(\"sq\")",
103
+ "original_lang_name": "albanian",
104
+ "original_lang_code": "sqi",
105
  "script": "Latn",
106
  "class_name": "SpaCyTokenizer"
107
  }
 
113
  ],
114
  "tokenizers": {
115
  "Latn": {
116
+ "full_object": "SpaCyTokenizer(\"sq\")",
117
+ "original_lang_name": "albanian",
118
+ "original_lang_code": "sqi",
119
  "script": "Latn",
120
  "class_name": "SpaCyTokenizer"
121
  }
 
231
  "children": [],
232
  "tokenizers": {
233
  "Latn": {
234
+ "full_object": "SpaCyTokenizer(\"lv\")",
235
+ "original_lang_name": "latvian",
236
+ "original_lang_code": "lav",
237
  "script": "Latn",
238
  "class_name": "SpaCyTokenizer"
239
  }
240
  },
241
  "node_i": "3934",
242
+ "native_tokenizers": [
243
+ "Latn"
244
+ ],
245
  "scripts": [
246
  "Latn"
247
  ]
 
253
  "children": [],
254
  "tokenizers": {
255
  "Latn": {
256
+ "full_object": "SpaCyTokenizer(\"lv\")",
257
+ "original_lang_name": "latvian",
258
+ "original_lang_code": "lav",
259
  "script": "Latn",
260
  "class_name": "SpaCyTokenizer"
261
  }
262
  },
263
  "node_i": "3935",
264
+ "native_tokenizers": [
265
+ "Latn"
266
+ ],
267
  "scripts": [
268
  "Latn"
269
  ]
 
275
  "children": [],
276
  "tokenizers": {
277
  "Latn": {
278
+ "full_object": "SpaCyTokenizer(\"lv\")",
279
+ "original_lang_name": "latvian",
280
+ "original_lang_code": "lav",
281
  "script": "Latn",
282
  "class_name": "SpaCyTokenizer"
283
  }
 
301
  ],
302
  "tokenizers": {
303
  "Latn": {
304
+ "full_object": "SpaCyTokenizer(\"lv\")",
305
+ "original_lang_name": "latvian",
306
+ "original_lang_code": "lav",
307
  "script": "Latn",
308
  "class_name": "SpaCyTokenizer"
309
  }
 
324
  "children": [],
325
  "tokenizers": {
326
  "Latn": {
327
+ "full_object": "SpaCyTokenizer(\"lv\")",
328
+ "original_lang_name": "latvian",
329
+ "original_lang_code": "lav",
330
  "script": "Latn",
331
  "class_name": "SpaCyTokenizer"
332
  }
 
350
  ],
351
  "tokenizers": {
352
  "Latn": {
353
+ "full_object": "SpaCyTokenizer(\"lv\")",
354
+ "original_lang_name": "latvian",
355
+ "original_lang_code": "lav",
356
  "script": "Latn",
357
  "class_name": "SpaCyTokenizer"
358
  }
 
364
  ],
365
  "tokenizers": {
366
  "Latn": {
367
+ "full_object": "SpaCyTokenizer(\"lv\")",
368
+ "original_lang_name": "latvian",
369
+ "original_lang_code": "lav",
370
  "script": "Latn",
371
  "class_name": "SpaCyTokenizer"
372
  }
 
597
  "children": [],
598
  "tokenizers": {
599
  "Latn": {
600
+ "full_object": "SpaCyTokenizer(\"sr\")",
601
+ "original_lang_name": "serbocroatian",
602
+ "original_lang_code": "hbs",
603
  "script": "Latn",
604
  "class_name": "SpaCyTokenizer"
605
  }
606
  },
607
  "node_i": "3954",
608
+ "native_tokenizers": [
609
+ "Latn"
610
+ ],
611
  "scripts": [
612
  "Latn"
613
  ]
 
619
  "children": [],
620
  "tokenizers": {
621
  "Latn": {
622
+ "full_object": "SpaCyTokenizer(\"sr\")",
623
+ "original_lang_name": "serbocroatian",
624
+ "original_lang_code": "hbs",
625
  "script": "Latn",
626
  "class_name": "SpaCyTokenizer"
627
  }
 
639
  "children": [],
640
  "tokenizers": {
641
  "Latn": {
642
+ "full_object": "SpaCyTokenizer(\"sr\")",
643
+ "original_lang_name": "serbocroatian",
644
+ "original_lang_code": "hbs",
645
  "script": "Latn",
646
  "class_name": "SpaCyTokenizer"
647
  }
648
  },
649
  "node_i": "3956",
650
+ "native_tokenizers": [
651
+ "Latn"
652
+ ],
653
  "scripts": [
654
  "Latn"
655
  ]
 
712
  "class_name": "SpaCyTokenizer"
713
  },
714
  "Latn": {
715
+ "full_object": "SpaCyTokenizer(\"sr\")",
716
+ "original_lang_name": "serbocroatian",
717
+ "original_lang_code": "hbs",
718
  "script": "Latn",
719
  "class_name": "SpaCyTokenizer"
720
  }
721
  },
722
  "node_i": "3959",
723
  "native_tokenizers": [
724
+ "Cyrl",
725
+ "Latn"
726
  ],
727
  "scripts": [
728
  "Cyrl",
 
742
  ],
743
  "tokenizers": {
744
  "Latn": {
745
+ "full_object": "SpaCyTokenizer(\"sr\")",
746
+ "original_lang_name": "serbocroatian",
747
+ "original_lang_code": "hbs",
748
  "script": "Latn",
749
  "class_name": "SpaCyTokenizer"
750
  },
 
770
  "class_name": "SpaCyTokenizer"
771
  },
772
  "Latn": {
773
+ "full_object": "SpaCyTokenizer(\"sr\")",
774
+ "original_lang_name": "serbocroatian",
775
+ "original_lang_code": "hbs",
776
  "script": "Latn",
777
  "class_name": "SpaCyTokenizer"
778
  }
 
2707
  "children": [],
2708
  "tokenizers": {
2709
  "Deva": {
2710
+ "full_object": "IndicNLPTokenizer(\"ne\")",
2711
+ "original_lang_name": "nepali",
2712
+ "original_lang_code": "nep",
2713
  "script": "Deva",
2714
  "class_name": "IndicNLPTokenizer"
2715
  }
 
2757
  "children": [],
2758
  "tokenizers": {
2759
  "Deva": {
2760
+ "full_object": "IndicNLPTokenizer(\"ne\")",
2761
+ "original_lang_name": "nepali",
2762
+ "original_lang_code": "nep",
2763
  "script": "Deva",
2764
  "class_name": "IndicNLPTokenizer"
2765
  }
 
2793
  ],
2794
  "tokenizers": {
2795
  "Deva": {
2796
+ "full_object": "IndicNLPTokenizer(\"ne\")",
2797
+ "original_lang_name": "nepali",
2798
+ "original_lang_code": "nep",
2799
  "script": "Deva",
2800
  "class_name": "IndicNLPTokenizer"
2801
  },
 
2823
  "children": [],
2824
  "tokenizers": {
2825
  "Deva": {
2826
+ "full_object": "IndicNLPTokenizer(\"ne\")",
2827
+ "original_lang_name": "nepali",
2828
+ "original_lang_code": "nep",
2829
  "script": "Deva",
2830
  "class_name": "IndicNLPTokenizer"
2831
  }
2832
  },
2833
  "node_i": "4080",
2834
+ "native_tokenizers": [
2835
+ "Deva"
2836
+ ],
2837
  "scripts": [
2838
  "Deva"
2839
  ]
 
2855
  "children": [],
2856
  "tokenizers": {
2857
  "Deva": {
2858
+ "full_object": "IndicNLPTokenizer(\"ne\")",
2859
+ "original_lang_name": "nepali",
2860
+ "original_lang_code": "nep",
2861
  "script": "Deva",
2862
  "class_name": "IndicNLPTokenizer"
2863
  },
 
2870
  }
2871
  },
2872
  "node_i": "4082",
2873
+ "native_tokenizers": [
2874
+ "Deva"
2875
+ ],
2876
  "scripts": [
2877
  "Latn",
2878
  "Deva"
 
2881
  ],
2882
  "tokenizers": {
2883
  "Deva": {
2884
+ "full_object": "IndicNLPTokenizer(\"ne\")",
2885
+ "original_lang_name": "nepali",
2886
+ "original_lang_code": "nep",
2887
  "script": "Deva",
2888
  "class_name": "IndicNLPTokenizer"
2889
  },
 
2902
  ],
2903
  "tokenizers": {
2904
  "Deva": {
2905
+ "full_object": "IndicNLPTokenizer(\"ne\")",
2906
+ "original_lang_name": "nepali",
2907
+ "original_lang_code": "nep",
2908
  "script": "Deva",
2909
  "class_name": "IndicNLPTokenizer"
2910
  },
 
3958
  "children": [],
3959
  "tokenizers": {
3960
  "Deva": {
3961
+ "full_object": "IndicNLPTokenizer(\"ne\")",
3962
+ "original_lang_name": "nepali",
3963
+ "original_lang_code": "nep",
3964
  "script": "Deva",
3965
  "class_name": "IndicNLPTokenizer"
3966
  }
 
3984
  ],
3985
  "tokenizers": {
3986
  "Deva": {
3987
+ "full_object": "IndicNLPTokenizer(\"ne\")",
3988
+ "original_lang_name": "nepali",
3989
+ "original_lang_code": "nep",
3990
  "script": "Deva",
3991
  "class_name": "IndicNLPTokenizer"
3992
  }
 
3998
  ],
3999
  "tokenizers": {
4000
  "Deva": {
4001
+ "full_object": "IndicNLPTokenizer(\"ne\")",
4002
+ "original_lang_name": "nepali",
4003
+ "original_lang_code": "nep",
4004
  "script": "Deva",
4005
  "class_name": "IndicNLPTokenizer"
4006
  }
 
4312
  "class_name": "IndicNLPTokenizer"
4313
  },
4314
  "Deva": {
4315
+ "full_object": "IndicNLPTokenizer(\"ne\")",
4316
+ "original_lang_name": "nepali",
4317
+ "original_lang_code": "nep",
4318
  "script": "Deva",
4319
  "class_name": "IndicNLPTokenizer"
4320
  },
 
4353
  }
4354
  ],
4355
  "tokenizers": {
4356
+ "Deva": {
4357
+ "full_object": "IndicNLPTokenizer(\"ne\")",
4358
+ "original_lang_name": "nepali",
4359
+ "original_lang_code": "nep",
4360
+ "script": "Deva",
4361
+ "class_name": "IndicNLPTokenizer"
4362
+ },
4363
  "Gujr": {
4364
  "full_object": "IndicNLPTokenizer(\"gu\")",
4365
  "original_lang_name": "gujarati",
 
4374
  "script": "Guru",
4375
  "class_name": "IndicNLPTokenizer"
4376
  },
 
 
 
 
 
 
 
4377
  "Arab": {
4378
  "full_object": "IndicNLPTokenizer(\"ur\")",
4379
  "original_lang_name": "urdu",
 
5007
  "iso_3_code": "ory",
5008
  "children": [],
5009
  "tokenizers": {
5010
+ "Orya": {
5011
+ "full_object": "IndicNLPTokenizer(\"or\")",
5012
+ "original_lang_name": "oriya",
5013
+ "original_lang_code": "ori",
5014
+ "script": "Orya",
5015
+ "class_name": "IndicNLPTokenizer"
5016
+ },
5017
  "Latn": {
5018
  "full_object": "StanzaTokenizer(\"kmr\")",
5019
  "original_lang_name": "northern_kurdish",
 
5023
  }
5024
  },
5025
  "node_i": "4234",
5026
+ "native_tokenizers": [
5027
+ "Orya"
5028
+ ],
5029
  "scripts": [
5030
  "Latn",
5031
  "Orya"
 
5053
  }
5054
  ],
5055
  "tokenizers": {
5056
+ "Orya": {
5057
+ "full_object": "IndicNLPTokenizer(\"or\")",
5058
+ "original_lang_name": "oriya",
5059
+ "original_lang_code": "ori",
5060
+ "script": "Orya",
5061
+ "class_name": "IndicNLPTokenizer"
5062
+ },
5063
  "Latn": {
5064
  "full_object": "StanzaTokenizer(\"kmr\")",
5065
  "original_lang_name": "northern_kurdish",
 
5147
  "script": "Deva",
5148
  "class_name": "IndicNLPTokenizer"
5149
  },
5150
+ "Orya": {
5151
+ "full_object": "IndicNLPTokenizer(\"or\")",
5152
+ "original_lang_name": "oriya",
5153
+ "original_lang_code": "ori",
5154
+ "script": "Orya",
5155
+ "class_name": "IndicNLPTokenizer"
5156
+ },
5157
  "Latn": {
5158
  "full_object": "StanzaTokenizer(\"kmr\")",
5159
  "original_lang_name": "northern_kurdish",
 
5796
  "children": [],
5797
  "tokenizers": {
5798
  "Deva": {
5799
+ "full_object": "IndicNLPTokenizer(\"kK\")",
5800
+ "original_lang_name": "konkani",
5801
+ "original_lang_code": "kok",
5802
  "script": "Deva",
5803
  "class_name": "IndicNLPTokenizer"
5804
  },
 
5811
  }
5812
  },
5813
  "node_i": "4287",
5814
+ "native_tokenizers": [
5815
+ "Deva"
5816
+ ],
5817
  "scripts": [
5818
  "Latn",
5819
  "Deva"
 
5826
  "children": [],
5827
  "tokenizers": {
5828
  "Deva": {
5829
+ "full_object": "IndicNLPTokenizer(\"kK\")",
5830
+ "original_lang_name": "konkani",
5831
+ "original_lang_code": "kok",
5832
  "script": "Deva",
5833
  "class_name": "IndicNLPTokenizer"
5834
  }
 
5892
  ],
5893
  "tokenizers": {
5894
  "Deva": {
5895
+ "full_object": "IndicNLPTokenizer(\"kK\")",
5896
+ "original_lang_name": "konkani",
5897
+ "original_lang_code": "kok",
5898
  "script": "Deva",
5899
  "class_name": "IndicNLPTokenizer"
5900
  },
 
6068
  "script": "Deva",
6069
  "class_name": "IndicNLPTokenizer"
6070
  },
6071
+ "Orya": {
6072
+ "full_object": "IndicNLPTokenizer(\"or\")",
6073
+ "original_lang_name": "oriya",
6074
+ "original_lang_code": "ori",
6075
+ "script": "Orya",
6076
+ "class_name": "IndicNLPTokenizer"
6077
+ },
6078
  "Arab": {
6079
  "full_object": "IndicNLPTokenizer(\"ur\")",
6080
  "original_lang_name": "sindhi",
 
6619
  "script": "Beng",
6620
  "class_name": "IndicNLPTokenizer"
6621
  },
6622
+ "Orya": {
6623
+ "full_object": "IndicNLPTokenizer(\"or\")",
6624
+ "original_lang_name": "oriya",
6625
+ "original_lang_code": "ori",
6626
+ "script": "Orya",
6627
+ "class_name": "IndicNLPTokenizer"
6628
+ },
6629
  "Arab": {
6630
  "full_object": "IndicNLPTokenizer(\"ur\")",
6631
  "original_lang_name": "urdu",
 
8271
  "script": "Beng",
8272
  "class_name": "IndicNLPTokenizer"
8273
  },
8274
+ "Orya": {
8275
+ "full_object": "IndicNLPTokenizer(\"or\")",
8276
+ "original_lang_name": "oriya",
8277
+ "original_lang_code": "ori",
8278
+ "script": "Orya",
8279
+ "class_name": "IndicNLPTokenizer"
8280
+ },
8281
  "Arab": {
8282
  "full_object": "IndicNLPTokenizer(\"ur\")",
8283
  "original_lang_name": "urdu",
 
9631
  }
9632
  ],
9633
  "tokenizers": {
 
 
 
 
 
 
 
9634
  "Latn": {
9635
  "full_object": "SpaCyTokenizer(\"en\")",
9636
  "original_lang_name": "english",
 
9638
  "script": "Latn",
9639
  "class_name": "SpaCyTokenizer"
9640
  },
9641
+ "Armn": {
9642
+ "full_object": "SpaCyTokenizer(\"hy\")",
9643
+ "original_lang_name": "armenian",
9644
+ "original_lang_code": "hye",
9645
+ "script": "Armn",
9646
+ "class_name": "SpaCyTokenizer"
9647
+ },
9648
  "Cyrl": {
9649
  "full_object": "SpaCyTokenizer(\"ru\")",
9650
  "original_lang_name": "russian",
 
9687
  "script": "Beng",
9688
  "class_name": "IndicNLPTokenizer"
9689
  },
9690
+ "Orya": {
9691
+ "full_object": "IndicNLPTokenizer(\"or\")",
9692
+ "original_lang_name": "oriya",
9693
+ "original_lang_code": "ori",
9694
+ "script": "Orya",
9695
+ "class_name": "IndicNLPTokenizer"
9696
+ },
9697
  "Arab": {
9698
  "full_object": "IndicNLPTokenizer(\"ur\")",
9699
  "original_lang_name": "urdu",
data/Sino-Tibetan.json CHANGED
@@ -35,15 +35,17 @@
35
  "children": [],
36
  "tokenizers": {
37
  "Hani": {
38
- "full_object": "StanzaTokenizer(\"lzh\")",
39
- "original_lang_name": "literary_chinese",
40
- "original_lang_code": "lzh",
41
  "script": "Hani",
42
- "class_name": "StanzaTokenizer"
43
  }
44
  },
45
  "node_i": "8922",
46
- "native_tokenizers": [],
 
 
47
  "scripts": [
48
  "Hani"
49
  ]
@@ -127,15 +129,17 @@
127
  "children": [],
128
  "tokenizers": {
129
  "Hani": {
130
- "full_object": "StanzaTokenizer(\"lzh\")",
131
- "original_lang_name": "literary_chinese",
132
- "original_lang_code": "lzh",
133
  "script": "Hani",
134
- "class_name": "StanzaTokenizer"
135
  }
136
  },
137
  "node_i": "8930",
138
- "native_tokenizers": [],
 
 
139
  "scripts": [
140
  "Latn",
141
  "Hani"
@@ -200,15 +204,17 @@
200
  "children": [],
201
  "tokenizers": {
202
  "Hani": {
203
- "full_object": "StanzaTokenizer(\"lzh\")",
204
- "original_lang_name": "literary_chinese",
205
- "original_lang_code": "lzh",
206
  "script": "Hani",
207
- "class_name": "StanzaTokenizer"
208
  }
209
  },
210
  "node_i": "8935",
211
- "native_tokenizers": [],
 
 
212
  "scripts": [
213
  "Hani"
214
  ]
@@ -220,15 +226,17 @@
220
  "children": [],
221
  "tokenizers": {
222
  "Hani": {
223
- "full_object": "StanzaTokenizer(\"lzh\")",
224
- "original_lang_name": "literary_chinese",
225
- "original_lang_code": "lzh",
226
  "script": "Hani",
227
- "class_name": "StanzaTokenizer"
228
  }
229
  },
230
  "node_i": "8936",
231
- "native_tokenizers": [],
 
 
232
  "scripts": [
233
  "Hani"
234
  ]
@@ -236,11 +244,11 @@
236
  ],
237
  "tokenizers": {
238
  "Hani": {
239
- "full_object": "StanzaTokenizer(\"lzh\")",
240
- "original_lang_name": "literary_chinese",
241
- "original_lang_code": "lzh",
242
  "script": "Hani",
243
- "class_name": "StanzaTokenizer"
244
  }
245
  },
246
  "node_i": "8919",
@@ -6211,11 +6219,11 @@
6211
  ],
6212
  "tokenizers": {
6213
  "Hani": {
6214
- "full_object": "StanzaTokenizer(\"lzh\")",
6215
- "original_lang_name": "literary_chinese",
6216
- "original_lang_code": "lzh",
6217
  "script": "Hani",
6218
- "class_name": "StanzaTokenizer"
6219
  },
6220
  "Deva": {
6221
  "full_object": "IndicNLPTokenizer(\"hi\")",
 
35
  "children": [],
36
  "tokenizers": {
37
  "Hani": {
38
+ "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
39
+ "original_lang_name": "chinese",
40
+ "original_lang_code": "zho",
41
  "script": "Hani",
42
+ "class_name": "SpaCyTokenizer"
43
  }
44
  },
45
  "node_i": "8922",
46
+ "native_tokenizers": [
47
+ "Hani"
48
+ ],
49
  "scripts": [
50
  "Hani"
51
  ]
 
129
  "children": [],
130
  "tokenizers": {
131
  "Hani": {
132
+ "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
133
+ "original_lang_name": "chinese",
134
+ "original_lang_code": "zho",
135
  "script": "Hani",
136
+ "class_name": "SpaCyTokenizer"
137
  }
138
  },
139
  "node_i": "8930",
140
+ "native_tokenizers": [
141
+ "Hani"
142
+ ],
143
  "scripts": [
144
  "Latn",
145
  "Hani"
 
204
  "children": [],
205
  "tokenizers": {
206
  "Hani": {
207
+ "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
208
+ "original_lang_name": "chinese",
209
+ "original_lang_code": "zho",
210
  "script": "Hani",
211
+ "class_name": "SpaCyTokenizer"
212
  }
213
  },
214
  "node_i": "8935",
215
+ "native_tokenizers": [
216
+ "Hani"
217
+ ],
218
  "scripts": [
219
  "Hani"
220
  ]
 
226
  "children": [],
227
  "tokenizers": {
228
  "Hani": {
229
+ "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
230
+ "original_lang_name": "chinese",
231
+ "original_lang_code": "zho",
232
  "script": "Hani",
233
+ "class_name": "SpaCyTokenizer"
234
  }
235
  },
236
  "node_i": "8936",
237
+ "native_tokenizers": [
238
+ "Hani"
239
+ ],
240
  "scripts": [
241
  "Hani"
242
  ]
 
244
  ],
245
  "tokenizers": {
246
  "Hani": {
247
+ "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
248
+ "original_lang_name": "chinese",
249
+ "original_lang_code": "zho",
250
  "script": "Hani",
251
+ "class_name": "SpaCyTokenizer"
252
  }
253
  },
254
  "node_i": "8919",
 
6219
  ],
6220
  "tokenizers": {
6221
  "Hani": {
6222
+ "full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
6223
+ "original_lang_name": "chinese",
6224
+ "original_lang_code": "zho",
6225
  "script": "Hani",
6226
+ "class_name": "SpaCyTokenizer"
6227
  },
6228
  "Deva": {
6229
  "full_object": "IndicNLPTokenizer(\"hi\")",
data/Turkic.json CHANGED
@@ -455,9 +455,9 @@
455
  "children": [],
456
  "tokenizers": {
457
  "Latn": {
458
- "full_object": "SpaCyTokenizer(\"tr\")",
459
- "original_lang_name": "turkish",
460
- "original_lang_code": "tur",
461
  "script": "Latn",
462
  "class_name": "SpaCyTokenizer"
463
  },
@@ -470,7 +470,9 @@
470
  }
471
  },
472
  "node_i": "10584",
473
- "native_tokenizers": [],
 
 
474
  "scripts": [
475
  "Latn",
476
  "Cyrl"
@@ -479,9 +481,9 @@
479
  ],
480
  "tokenizers": {
481
  "Latn": {
482
- "full_object": "SpaCyTokenizer(\"tr\")",
483
- "original_lang_name": "turkish",
484
- "original_lang_code": "tur",
485
  "script": "Latn",
486
  "class_name": "SpaCyTokenizer"
487
  },
 
455
  "children": [],
456
  "tokenizers": {
457
  "Latn": {
458
+ "full_object": "SpaCyTokenizer(\"az\")",
459
+ "original_lang_name": "azerbaijani",
460
+ "original_lang_code": "aze",
461
  "script": "Latn",
462
  "class_name": "SpaCyTokenizer"
463
  },
 
470
  }
471
  },
472
  "node_i": "10584",
473
+ "native_tokenizers": [
474
+ "Latn"
475
+ ],
476
  "scripts": [
477
  "Latn",
478
  "Cyrl"
 
481
  ],
482
  "tokenizers": {
483
  "Latn": {
484
+ "full_object": "SpaCyTokenizer(\"az\")",
485
+ "original_lang_name": "azerbaijani",
486
+ "original_lang_code": "aze",
487
  "script": "Latn",
488
  "class_name": "SpaCyTokenizer"
489
  },
data/Uralic.json CHANGED
@@ -77,15 +77,17 @@
77
  "children": [],
78
  "tokenizers": {
79
  "Latn": {
80
- "full_object": "SpaCyTokenizer(\"fi\")",
81
- "original_lang_name": "finnish",
82
- "original_lang_code": "fin",
83
  "script": "Latn",
84
  "class_name": "SpaCyTokenizer"
85
  }
86
  },
87
  "node_i": "10680",
88
- "native_tokenizers": [],
 
 
89
  "scripts": [
90
  "Latn"
91
  ]
@@ -289,15 +291,17 @@
289
  "children": [],
290
  "tokenizers": {
291
  "Latn": {
292
- "full_object": "SpaCyTokenizer(\"fi\")",
293
- "original_lang_name": "finnish",
294
- "original_lang_code": "fin",
295
  "script": "Latn",
296
  "class_name": "SpaCyTokenizer"
297
  }
298
  },
299
  "node_i": "10691",
300
- "native_tokenizers": [],
 
 
301
  "scripts": [
302
  "Latn"
303
  ]
 
77
  "children": [],
78
  "tokenizers": {
79
  "Latn": {
80
+ "full_object": "SpaCyTokenizer(\"et\")",
81
+ "original_lang_name": "estonian",
82
+ "original_lang_code": "est",
83
  "script": "Latn",
84
  "class_name": "SpaCyTokenizer"
85
  }
86
  },
87
  "node_i": "10680",
88
+ "native_tokenizers": [
89
+ "Latn"
90
+ ],
91
  "scripts": [
92
  "Latn"
93
  ]
 
291
  "children": [],
292
  "tokenizers": {
293
  "Latn": {
294
+ "full_object": "SpaCyTokenizer(\"et\")",
295
+ "original_lang_name": "estonian",
296
+ "original_lang_code": "est",
297
  "script": "Latn",
298
  "class_name": "SpaCyTokenizer"
299
  }
300
  },
301
  "node_i": "10691",
302
+ "native_tokenizers": [
303
+ "Latn"
304
+ ],
305
  "scripts": [
306
  "Latn"
307
  ]