macrolanguages fix
Browse files- data/Afro-Asiatic.json +153 -19
- data/Austronesian.json +0 -0
- data/Creole.json +126 -122
- data/Indo-European.json +191 -121
- data/Sino-Tibetan.json +36 -28
- data/Turkic.json +9 -7
- data/Uralic.json +12 -8
data/Afro-Asiatic.json
CHANGED
@@ -6035,9 +6035,19 @@
|
|
6035 |
"iso_1_code": "ar",
|
6036 |
"iso_3_code": "acm",
|
6037 |
"children": [],
|
6038 |
-
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6039 |
"node_i": "487",
|
6040 |
-
"native_tokenizers": [
|
|
|
|
|
6041 |
"scripts": [
|
6042 |
"Arab"
|
6043 |
]
|
@@ -6097,9 +6107,19 @@
|
|
6097 |
"iso_1_code": "ar",
|
6098 |
"iso_3_code": "aeb",
|
6099 |
"children": [],
|
6100 |
-
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6101 |
"node_i": "493",
|
6102 |
-
"native_tokenizers": [
|
|
|
|
|
6103 |
"scripts": [
|
6104 |
"Arab"
|
6105 |
]
|
@@ -6139,9 +6159,19 @@
|
|
6139 |
"iso_1_code": "ar",
|
6140 |
"iso_3_code": "apc",
|
6141 |
"children": [],
|
6142 |
-
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6143 |
"node_i": "497",
|
6144 |
-
"native_tokenizers": [
|
|
|
|
|
6145 |
"scripts": [
|
6146 |
"Arab"
|
6147 |
]
|
@@ -6162,6 +6192,13 @@
|
|
6162 |
"iso_3_code": "arb",
|
6163 |
"children": [],
|
6164 |
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6165 |
"Latn": {
|
6166 |
"full_object": "StanzaTokenizer(\"mt\")",
|
6167 |
"original_lang_name": "maltese",
|
@@ -6171,7 +6208,9 @@
|
|
6171 |
}
|
6172 |
},
|
6173 |
"node_i": "499",
|
6174 |
-
"native_tokenizers": [
|
|
|
|
|
6175 |
"scripts": [
|
6176 |
"Arab",
|
6177 |
"Latn"
|
@@ -6182,9 +6221,19 @@
|
|
6182 |
"iso_1_code": "ar",
|
6183 |
"iso_3_code": "arq",
|
6184 |
"children": [],
|
6185 |
-
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6186 |
"node_i": "500",
|
6187 |
-
"native_tokenizers": [
|
|
|
|
|
6188 |
"scripts": [
|
6189 |
"Arab"
|
6190 |
]
|
@@ -6194,9 +6243,19 @@
|
|
6194 |
"iso_1_code": "ar",
|
6195 |
"iso_3_code": "ars",
|
6196 |
"children": [],
|
6197 |
-
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6198 |
"node_i": "501",
|
6199 |
-
"native_tokenizers": [
|
|
|
|
|
6200 |
"scripts": [
|
6201 |
"Arab"
|
6202 |
]
|
@@ -6206,9 +6265,19 @@
|
|
6206 |
"iso_1_code": "ar",
|
6207 |
"iso_3_code": "ary",
|
6208 |
"children": [],
|
6209 |
-
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6210 |
"node_i": "502",
|
6211 |
-
"native_tokenizers": [
|
|
|
|
|
6212 |
"scripts": [
|
6213 |
"Arab"
|
6214 |
]
|
@@ -6218,9 +6287,19 @@
|
|
6218 |
"iso_1_code": "ar",
|
6219 |
"iso_3_code": "arz",
|
6220 |
"children": [],
|
6221 |
-
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6222 |
"node_i": "503",
|
6223 |
-
"native_tokenizers": [
|
|
|
|
|
6224 |
"scripts": [
|
6225 |
"Arab"
|
6226 |
]
|
@@ -6280,9 +6359,19 @@
|
|
6280 |
"iso_1_code": "ar",
|
6281 |
"iso_3_code": "ayp",
|
6282 |
"children": [],
|
6283 |
-
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6284 |
"node_i": "509",
|
6285 |
-
"native_tokenizers": [
|
|
|
|
|
6286 |
"scripts": [
|
6287 |
"Arab"
|
6288 |
]
|
@@ -6334,9 +6423,19 @@
|
|
6334 |
"iso_1_code": "ar",
|
6335 |
"iso_3_code": "shu",
|
6336 |
"children": [],
|
6337 |
-
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6338 |
"node_i": "513",
|
6339 |
-
"native_tokenizers": [
|
|
|
|
|
6340 |
"scripts": [
|
6341 |
"Arab"
|
6342 |
]
|
@@ -6373,6 +6472,13 @@
|
|
6373 |
}
|
6374 |
],
|
6375 |
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6376 |
"Latn": {
|
6377 |
"full_object": "StanzaTokenizer(\"mt\")",
|
6378 |
"original_lang_name": "maltese",
|
@@ -6460,6 +6566,13 @@
|
|
6460 |
}
|
6461 |
],
|
6462 |
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6463 |
"Latn": {
|
6464 |
"full_object": "StanzaTokenizer(\"mt\")",
|
6465 |
"original_lang_name": "maltese",
|
@@ -6481,6 +6594,13 @@
|
|
6481 |
}
|
6482 |
],
|
6483 |
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6484 |
"Latn": {
|
6485 |
"full_object": "StanzaTokenizer(\"mt\")",
|
6486 |
"original_lang_name": "maltese",
|
@@ -6951,6 +7071,13 @@
|
|
6951 |
}
|
6952 |
],
|
6953 |
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6954 |
"Latn": {
|
6955 |
"full_object": "StanzaTokenizer(\"mt\")",
|
6956 |
"original_lang_name": "maltese",
|
@@ -7007,6 +7134,13 @@
|
|
7007 |
"script": "Copt",
|
7008 |
"class_name": "StanzaTokenizer"
|
7009 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7010 |
"Latn": {
|
7011 |
"full_object": "StanzaTokenizer(\"mt\")",
|
7012 |
"original_lang_name": "maltese",
|
|
|
6035 |
"iso_1_code": "ar",
|
6036 |
"iso_3_code": "acm",
|
6037 |
"children": [],
|
6038 |
+
"tokenizers": {
|
6039 |
+
"Arab": {
|
6040 |
+
"full_object": "SpaCyTokenizer(\"ar\")",
|
6041 |
+
"original_lang_name": "arabic",
|
6042 |
+
"original_lang_code": "ara",
|
6043 |
+
"script": "Arab",
|
6044 |
+
"class_name": "SpaCyTokenizer"
|
6045 |
+
}
|
6046 |
+
},
|
6047 |
"node_i": "487",
|
6048 |
+
"native_tokenizers": [
|
6049 |
+
"Arab"
|
6050 |
+
],
|
6051 |
"scripts": [
|
6052 |
"Arab"
|
6053 |
]
|
|
|
6107 |
"iso_1_code": "ar",
|
6108 |
"iso_3_code": "aeb",
|
6109 |
"children": [],
|
6110 |
+
"tokenizers": {
|
6111 |
+
"Arab": {
|
6112 |
+
"full_object": "SpaCyTokenizer(\"ar\")",
|
6113 |
+
"original_lang_name": "arabic",
|
6114 |
+
"original_lang_code": "ara",
|
6115 |
+
"script": "Arab",
|
6116 |
+
"class_name": "SpaCyTokenizer"
|
6117 |
+
}
|
6118 |
+
},
|
6119 |
"node_i": "493",
|
6120 |
+
"native_tokenizers": [
|
6121 |
+
"Arab"
|
6122 |
+
],
|
6123 |
"scripts": [
|
6124 |
"Arab"
|
6125 |
]
|
|
|
6159 |
"iso_1_code": "ar",
|
6160 |
"iso_3_code": "apc",
|
6161 |
"children": [],
|
6162 |
+
"tokenizers": {
|
6163 |
+
"Arab": {
|
6164 |
+
"full_object": "SpaCyTokenizer(\"ar\")",
|
6165 |
+
"original_lang_name": "arabic",
|
6166 |
+
"original_lang_code": "ara",
|
6167 |
+
"script": "Arab",
|
6168 |
+
"class_name": "SpaCyTokenizer"
|
6169 |
+
}
|
6170 |
+
},
|
6171 |
"node_i": "497",
|
6172 |
+
"native_tokenizers": [
|
6173 |
+
"Arab"
|
6174 |
+
],
|
6175 |
"scripts": [
|
6176 |
"Arab"
|
6177 |
]
|
|
|
6192 |
"iso_3_code": "arb",
|
6193 |
"children": [],
|
6194 |
"tokenizers": {
|
6195 |
+
"Arab": {
|
6196 |
+
"full_object": "SpaCyTokenizer(\"ar\")",
|
6197 |
+
"original_lang_name": "arabic",
|
6198 |
+
"original_lang_code": "ara",
|
6199 |
+
"script": "Arab",
|
6200 |
+
"class_name": "SpaCyTokenizer"
|
6201 |
+
},
|
6202 |
"Latn": {
|
6203 |
"full_object": "StanzaTokenizer(\"mt\")",
|
6204 |
"original_lang_name": "maltese",
|
|
|
6208 |
}
|
6209 |
},
|
6210 |
"node_i": "499",
|
6211 |
+
"native_tokenizers": [
|
6212 |
+
"Arab"
|
6213 |
+
],
|
6214 |
"scripts": [
|
6215 |
"Arab",
|
6216 |
"Latn"
|
|
|
6221 |
"iso_1_code": "ar",
|
6222 |
"iso_3_code": "arq",
|
6223 |
"children": [],
|
6224 |
+
"tokenizers": {
|
6225 |
+
"Arab": {
|
6226 |
+
"full_object": "SpaCyTokenizer(\"ar\")",
|
6227 |
+
"original_lang_name": "arabic",
|
6228 |
+
"original_lang_code": "ara",
|
6229 |
+
"script": "Arab",
|
6230 |
+
"class_name": "SpaCyTokenizer"
|
6231 |
+
}
|
6232 |
+
},
|
6233 |
"node_i": "500",
|
6234 |
+
"native_tokenizers": [
|
6235 |
+
"Arab"
|
6236 |
+
],
|
6237 |
"scripts": [
|
6238 |
"Arab"
|
6239 |
]
|
|
|
6243 |
"iso_1_code": "ar",
|
6244 |
"iso_3_code": "ars",
|
6245 |
"children": [],
|
6246 |
+
"tokenizers": {
|
6247 |
+
"Arab": {
|
6248 |
+
"full_object": "SpaCyTokenizer(\"ar\")",
|
6249 |
+
"original_lang_name": "arabic",
|
6250 |
+
"original_lang_code": "ara",
|
6251 |
+
"script": "Arab",
|
6252 |
+
"class_name": "SpaCyTokenizer"
|
6253 |
+
}
|
6254 |
+
},
|
6255 |
"node_i": "501",
|
6256 |
+
"native_tokenizers": [
|
6257 |
+
"Arab"
|
6258 |
+
],
|
6259 |
"scripts": [
|
6260 |
"Arab"
|
6261 |
]
|
|
|
6265 |
"iso_1_code": "ar",
|
6266 |
"iso_3_code": "ary",
|
6267 |
"children": [],
|
6268 |
+
"tokenizers": {
|
6269 |
+
"Arab": {
|
6270 |
+
"full_object": "SpaCyTokenizer(\"ar\")",
|
6271 |
+
"original_lang_name": "arabic",
|
6272 |
+
"original_lang_code": "ara",
|
6273 |
+
"script": "Arab",
|
6274 |
+
"class_name": "SpaCyTokenizer"
|
6275 |
+
}
|
6276 |
+
},
|
6277 |
"node_i": "502",
|
6278 |
+
"native_tokenizers": [
|
6279 |
+
"Arab"
|
6280 |
+
],
|
6281 |
"scripts": [
|
6282 |
"Arab"
|
6283 |
]
|
|
|
6287 |
"iso_1_code": "ar",
|
6288 |
"iso_3_code": "arz",
|
6289 |
"children": [],
|
6290 |
+
"tokenizers": {
|
6291 |
+
"Arab": {
|
6292 |
+
"full_object": "SpaCyTokenizer(\"ar\")",
|
6293 |
+
"original_lang_name": "arabic",
|
6294 |
+
"original_lang_code": "ara",
|
6295 |
+
"script": "Arab",
|
6296 |
+
"class_name": "SpaCyTokenizer"
|
6297 |
+
}
|
6298 |
+
},
|
6299 |
"node_i": "503",
|
6300 |
+
"native_tokenizers": [
|
6301 |
+
"Arab"
|
6302 |
+
],
|
6303 |
"scripts": [
|
6304 |
"Arab"
|
6305 |
]
|
|
|
6359 |
"iso_1_code": "ar",
|
6360 |
"iso_3_code": "ayp",
|
6361 |
"children": [],
|
6362 |
+
"tokenizers": {
|
6363 |
+
"Arab": {
|
6364 |
+
"full_object": "SpaCyTokenizer(\"ar\")",
|
6365 |
+
"original_lang_name": "arabic",
|
6366 |
+
"original_lang_code": "ara",
|
6367 |
+
"script": "Arab",
|
6368 |
+
"class_name": "SpaCyTokenizer"
|
6369 |
+
}
|
6370 |
+
},
|
6371 |
"node_i": "509",
|
6372 |
+
"native_tokenizers": [
|
6373 |
+
"Arab"
|
6374 |
+
],
|
6375 |
"scripts": [
|
6376 |
"Arab"
|
6377 |
]
|
|
|
6423 |
"iso_1_code": "ar",
|
6424 |
"iso_3_code": "shu",
|
6425 |
"children": [],
|
6426 |
+
"tokenizers": {
|
6427 |
+
"Arab": {
|
6428 |
+
"full_object": "SpaCyTokenizer(\"ar\")",
|
6429 |
+
"original_lang_name": "arabic",
|
6430 |
+
"original_lang_code": "ara",
|
6431 |
+
"script": "Arab",
|
6432 |
+
"class_name": "SpaCyTokenizer"
|
6433 |
+
}
|
6434 |
+
},
|
6435 |
"node_i": "513",
|
6436 |
+
"native_tokenizers": [
|
6437 |
+
"Arab"
|
6438 |
+
],
|
6439 |
"scripts": [
|
6440 |
"Arab"
|
6441 |
]
|
|
|
6472 |
}
|
6473 |
],
|
6474 |
"tokenizers": {
|
6475 |
+
"Arab": {
|
6476 |
+
"full_object": "SpaCyTokenizer(\"ar\")",
|
6477 |
+
"original_lang_name": "arabic",
|
6478 |
+
"original_lang_code": "ara",
|
6479 |
+
"script": "Arab",
|
6480 |
+
"class_name": "SpaCyTokenizer"
|
6481 |
+
},
|
6482 |
"Latn": {
|
6483 |
"full_object": "StanzaTokenizer(\"mt\")",
|
6484 |
"original_lang_name": "maltese",
|
|
|
6566 |
}
|
6567 |
],
|
6568 |
"tokenizers": {
|
6569 |
+
"Arab": {
|
6570 |
+
"full_object": "SpaCyTokenizer(\"ar\")",
|
6571 |
+
"original_lang_name": "arabic",
|
6572 |
+
"original_lang_code": "ara",
|
6573 |
+
"script": "Arab",
|
6574 |
+
"class_name": "SpaCyTokenizer"
|
6575 |
+
},
|
6576 |
"Latn": {
|
6577 |
"full_object": "StanzaTokenizer(\"mt\")",
|
6578 |
"original_lang_name": "maltese",
|
|
|
6594 |
}
|
6595 |
],
|
6596 |
"tokenizers": {
|
6597 |
+
"Arab": {
|
6598 |
+
"full_object": "SpaCyTokenizer(\"ar\")",
|
6599 |
+
"original_lang_name": "arabic",
|
6600 |
+
"original_lang_code": "ara",
|
6601 |
+
"script": "Arab",
|
6602 |
+
"class_name": "SpaCyTokenizer"
|
6603 |
+
},
|
6604 |
"Latn": {
|
6605 |
"full_object": "StanzaTokenizer(\"mt\")",
|
6606 |
"original_lang_name": "maltese",
|
|
|
7071 |
}
|
7072 |
],
|
7073 |
"tokenizers": {
|
7074 |
+
"Arab": {
|
7075 |
+
"full_object": "SpaCyTokenizer(\"ar\")",
|
7076 |
+
"original_lang_name": "arabic",
|
7077 |
+
"original_lang_code": "ara",
|
7078 |
+
"script": "Arab",
|
7079 |
+
"class_name": "SpaCyTokenizer"
|
7080 |
+
},
|
7081 |
"Latn": {
|
7082 |
"full_object": "StanzaTokenizer(\"mt\")",
|
7083 |
"original_lang_name": "maltese",
|
|
|
7134 |
"script": "Copt",
|
7135 |
"class_name": "StanzaTokenizer"
|
7136 |
},
|
7137 |
+
"Arab": {
|
7138 |
+
"full_object": "SpaCyTokenizer(\"ar\")",
|
7139 |
+
"original_lang_name": "arabic",
|
7140 |
+
"original_lang_code": "ara",
|
7141 |
+
"script": "Arab",
|
7142 |
+
"class_name": "SpaCyTokenizer"
|
7143 |
+
},
|
7144 |
"Latn": {
|
7145 |
"full_object": "StanzaTokenizer(\"mt\")",
|
7146 |
"original_lang_name": "maltese",
|
data/Austronesian.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
data/Creole.json
CHANGED
@@ -833,11 +833,11 @@
|
|
833 |
"children": [],
|
834 |
"tokenizers": {
|
835 |
"Latn": {
|
836 |
-
"full_object": "
|
837 |
-
"original_lang_name": "
|
838 |
-
"original_lang_code": "
|
839 |
"script": "Latn",
|
840 |
-
"class_name": "
|
841 |
}
|
842 |
},
|
843 |
"node_i": "3540",
|
@@ -863,11 +863,11 @@
|
|
863 |
"children": [],
|
864 |
"tokenizers": {
|
865 |
"Latn": {
|
866 |
-
"full_object": "
|
867 |
-
"original_lang_name": "
|
868 |
-
"original_lang_code": "
|
869 |
"script": "Latn",
|
870 |
-
"class_name": "
|
871 |
}
|
872 |
},
|
873 |
"node_i": "3542",
|
@@ -883,11 +883,11 @@
|
|
883 |
"children": [],
|
884 |
"tokenizers": {
|
885 |
"Latn": {
|
886 |
-
"full_object": "
|
887 |
-
"original_lang_name": "
|
888 |
-
"original_lang_code": "
|
889 |
"script": "Latn",
|
890 |
-
"class_name": "
|
891 |
}
|
892 |
},
|
893 |
"node_i": "3543",
|
@@ -903,11 +903,11 @@
|
|
903 |
"children": [],
|
904 |
"tokenizers": {
|
905 |
"Latn": {
|
906 |
-
"full_object": "
|
907 |
-
"original_lang_name": "
|
908 |
-
"original_lang_code": "
|
909 |
"script": "Latn",
|
910 |
-
"class_name": "
|
911 |
}
|
912 |
},
|
913 |
"node_i": "3544",
|
@@ -923,11 +923,11 @@
|
|
923 |
"children": [],
|
924 |
"tokenizers": {
|
925 |
"Latn": {
|
926 |
-
"full_object": "
|
927 |
-
"original_lang_name": "
|
928 |
-
"original_lang_code": "
|
929 |
"script": "Latn",
|
930 |
-
"class_name": "
|
931 |
}
|
932 |
},
|
933 |
"node_i": "3545",
|
@@ -963,11 +963,11 @@
|
|
963 |
"children": [],
|
964 |
"tokenizers": {
|
965 |
"Latn": {
|
966 |
-
"full_object": "
|
967 |
-
"original_lang_name": "
|
968 |
-
"original_lang_code": "
|
969 |
"script": "Latn",
|
970 |
-
"class_name": "
|
971 |
}
|
972 |
},
|
973 |
"node_i": "3548",
|
@@ -983,11 +983,11 @@
|
|
983 |
"children": [],
|
984 |
"tokenizers": {
|
985 |
"Latn": {
|
986 |
-
"full_object": "
|
987 |
-
"original_lang_name": "
|
988 |
-
"original_lang_code": "
|
989 |
"script": "Latn",
|
990 |
-
"class_name": "
|
991 |
}
|
992 |
},
|
993 |
"node_i": "3549",
|
@@ -1009,11 +1009,11 @@
|
|
1009 |
],
|
1010 |
"tokenizers": {
|
1011 |
"Latn": {
|
1012 |
-
"full_object": "
|
1013 |
-
"original_lang_name": "
|
1014 |
-
"original_lang_code": "
|
1015 |
"script": "Latn",
|
1016 |
-
"class_name": "
|
1017 |
}
|
1018 |
},
|
1019 |
"node_i": "3539",
|
@@ -1074,11 +1074,11 @@
|
|
1074 |
"children": [],
|
1075 |
"tokenizers": {
|
1076 |
"Latn": {
|
1077 |
-
"full_object": "
|
1078 |
-
"original_lang_name": "
|
1079 |
-
"original_lang_code": "
|
1080 |
"script": "Latn",
|
1081 |
-
"class_name": "
|
1082 |
}
|
1083 |
},
|
1084 |
"node_i": "3556",
|
@@ -1090,11 +1090,11 @@
|
|
1090 |
],
|
1091 |
"tokenizers": {
|
1092 |
"Latn": {
|
1093 |
-
"full_object": "
|
1094 |
-
"original_lang_name": "
|
1095 |
-
"original_lang_code": "
|
1096 |
"script": "Latn",
|
1097 |
-
"class_name": "
|
1098 |
}
|
1099 |
},
|
1100 |
"node_i": "3555",
|
@@ -1134,11 +1134,11 @@
|
|
1134 |
"children": [],
|
1135 |
"tokenizers": {
|
1136 |
"Latn": {
|
1137 |
-
"full_object": "
|
1138 |
-
"original_lang_name": "
|
1139 |
-
"original_lang_code": "
|
1140 |
"script": "Latn",
|
1141 |
-
"class_name": "
|
1142 |
}
|
1143 |
},
|
1144 |
"node_i": "3560",
|
@@ -1160,11 +1160,11 @@
|
|
1160 |
],
|
1161 |
"tokenizers": {
|
1162 |
"Latn": {
|
1163 |
-
"full_object": "
|
1164 |
-
"original_lang_name": "
|
1165 |
-
"original_lang_code": "
|
1166 |
"script": "Latn",
|
1167 |
-
"class_name": "
|
1168 |
}
|
1169 |
},
|
1170 |
"node_i": "3559",
|
@@ -1183,11 +1183,11 @@
|
|
1183 |
"children": [],
|
1184 |
"tokenizers": {
|
1185 |
"Latn": {
|
1186 |
-
"full_object": "
|
1187 |
-
"original_lang_name": "
|
1188 |
-
"original_lang_code": "
|
1189 |
"script": "Latn",
|
1190 |
-
"class_name": "
|
1191 |
}
|
1192 |
},
|
1193 |
"node_i": "3563",
|
@@ -1203,11 +1203,11 @@
|
|
1203 |
"children": [],
|
1204 |
"tokenizers": {
|
1205 |
"Latn": {
|
1206 |
-
"full_object": "
|
1207 |
-
"original_lang_name": "
|
1208 |
-
"original_lang_code": "
|
1209 |
"script": "Latn",
|
1210 |
-
"class_name": "
|
1211 |
}
|
1212 |
},
|
1213 |
"node_i": "3564",
|
@@ -1263,15 +1263,17 @@
|
|
1263 |
"children": [],
|
1264 |
"tokenizers": {
|
1265 |
"Latn": {
|
1266 |
-
"full_object": "
|
1267 |
-
"original_lang_name": "
|
1268 |
-
"original_lang_code": "
|
1269 |
"script": "Latn",
|
1270 |
-
"class_name": "
|
1271 |
}
|
1272 |
},
|
1273 |
"node_i": "3569",
|
1274 |
-
"native_tokenizers": [
|
|
|
|
|
1275 |
"scripts": [
|
1276 |
"Latn"
|
1277 |
]
|
@@ -1283,11 +1285,11 @@
|
|
1283 |
"children": [],
|
1284 |
"tokenizers": {
|
1285 |
"Latn": {
|
1286 |
-
"full_object": "
|
1287 |
-
"original_lang_name": "
|
1288 |
-
"original_lang_code": "
|
1289 |
"script": "Latn",
|
1290 |
-
"class_name": "
|
1291 |
}
|
1292 |
},
|
1293 |
"node_i": "3570",
|
@@ -1313,11 +1315,11 @@
|
|
1313 |
"children": [],
|
1314 |
"tokenizers": {
|
1315 |
"Latn": {
|
1316 |
-
"full_object": "
|
1317 |
-
"original_lang_name": "
|
1318 |
-
"original_lang_code": "
|
1319 |
"script": "Latn",
|
1320 |
-
"class_name": "
|
1321 |
}
|
1322 |
},
|
1323 |
"node_i": "3572",
|
@@ -1363,15 +1365,17 @@
|
|
1363 |
"children": [],
|
1364 |
"tokenizers": {
|
1365 |
"Latn": {
|
1366 |
-
"full_object": "
|
1367 |
-
"original_lang_name": "
|
1368 |
-
"original_lang_code": "
|
1369 |
"script": "Latn",
|
1370 |
-
"class_name": "
|
1371 |
}
|
1372 |
},
|
1373 |
"node_i": "3576",
|
1374 |
-
"native_tokenizers": [
|
|
|
|
|
1375 |
"scripts": [
|
1376 |
"Latn"
|
1377 |
]
|
@@ -1379,11 +1383,11 @@
|
|
1379 |
],
|
1380 |
"tokenizers": {
|
1381 |
"Latn": {
|
1382 |
-
"full_object": "
|
1383 |
-
"original_lang_name": "
|
1384 |
-
"original_lang_code": "
|
1385 |
"script": "Latn",
|
1386 |
-
"class_name": "
|
1387 |
}
|
1388 |
},
|
1389 |
"node_i": "3562",
|
@@ -1402,11 +1406,11 @@
|
|
1402 |
"children": [],
|
1403 |
"tokenizers": {
|
1404 |
"Latn": {
|
1405 |
-
"full_object": "
|
1406 |
-
"original_lang_name": "
|
1407 |
-
"original_lang_code": "
|
1408 |
"script": "Latn",
|
1409 |
-
"class_name": "
|
1410 |
}
|
1411 |
},
|
1412 |
"node_i": "3578",
|
@@ -1428,11 +1432,11 @@
|
|
1428 |
],
|
1429 |
"tokenizers": {
|
1430 |
"Latn": {
|
1431 |
-
"full_object": "
|
1432 |
-
"original_lang_name": "
|
1433 |
-
"original_lang_code": "
|
1434 |
"script": "Latn",
|
1435 |
-
"class_name": "
|
1436 |
}
|
1437 |
},
|
1438 |
"node_i": "3577",
|
@@ -1471,11 +1475,11 @@
|
|
1471 |
"children": [],
|
1472 |
"tokenizers": {
|
1473 |
"Latn": {
|
1474 |
-
"full_object": "
|
1475 |
-
"original_lang_name": "
|
1476 |
-
"original_lang_code": "
|
1477 |
"script": "Latn",
|
1478 |
-
"class_name": "
|
1479 |
}
|
1480 |
},
|
1481 |
"node_i": "3583",
|
@@ -1511,11 +1515,11 @@
|
|
1511 |
"children": [],
|
1512 |
"tokenizers": {
|
1513 |
"Latn": {
|
1514 |
-
"full_object": "
|
1515 |
-
"original_lang_name": "
|
1516 |
-
"original_lang_code": "
|
1517 |
"script": "Latn",
|
1518 |
-
"class_name": "
|
1519 |
}
|
1520 |
},
|
1521 |
"node_i": "3586",
|
@@ -1551,11 +1555,11 @@
|
|
1551 |
"children": [],
|
1552 |
"tokenizers": {
|
1553 |
"Latn": {
|
1554 |
-
"full_object": "
|
1555 |
-
"original_lang_name": "
|
1556 |
-
"original_lang_code": "
|
1557 |
"script": "Latn",
|
1558 |
-
"class_name": "
|
1559 |
}
|
1560 |
},
|
1561 |
"node_i": "3589",
|
@@ -1607,11 +1611,11 @@
|
|
1607 |
],
|
1608 |
"tokenizers": {
|
1609 |
"Latn": {
|
1610 |
-
"full_object": "
|
1611 |
-
"original_lang_name": "
|
1612 |
-
"original_lang_code": "
|
1613 |
"script": "Latn",
|
1614 |
-
"class_name": "
|
1615 |
}
|
1616 |
},
|
1617 |
"node_i": "3580",
|
@@ -1630,11 +1634,11 @@
|
|
1630 |
"children": [],
|
1631 |
"tokenizers": {
|
1632 |
"Latn": {
|
1633 |
-
"full_object": "
|
1634 |
-
"original_lang_name": "
|
1635 |
-
"original_lang_code": "
|
1636 |
"script": "Latn",
|
1637 |
-
"class_name": "
|
1638 |
}
|
1639 |
},
|
1640 |
"node_i": "3595",
|
@@ -1656,11 +1660,11 @@
|
|
1656 |
],
|
1657 |
"tokenizers": {
|
1658 |
"Latn": {
|
1659 |
-
"full_object": "
|
1660 |
-
"original_lang_name": "
|
1661 |
-
"original_lang_code": "
|
1662 |
"script": "Latn",
|
1663 |
-
"class_name": "
|
1664 |
}
|
1665 |
},
|
1666 |
"node_i": "3594",
|
@@ -1700,11 +1704,11 @@
|
|
1700 |
"children": [],
|
1701 |
"tokenizers": {
|
1702 |
"Latn": {
|
1703 |
-
"full_object": "
|
1704 |
-
"original_lang_name": "
|
1705 |
-
"original_lang_code": "
|
1706 |
"script": "Latn",
|
1707 |
-
"class_name": "
|
1708 |
}
|
1709 |
},
|
1710 |
"node_i": "3600",
|
@@ -1716,11 +1720,11 @@
|
|
1716 |
],
|
1717 |
"tokenizers": {
|
1718 |
"Latn": {
|
1719 |
-
"full_object": "
|
1720 |
-
"original_lang_name": "
|
1721 |
-
"original_lang_code": "
|
1722 |
"script": "Latn",
|
1723 |
-
"class_name": "
|
1724 |
}
|
1725 |
},
|
1726 |
"node_i": "3599",
|
@@ -1730,11 +1734,11 @@
|
|
1730 |
],
|
1731 |
"tokenizers": {
|
1732 |
"Latn": {
|
1733 |
-
"full_object": "
|
1734 |
-
"original_lang_name": "
|
1735 |
-
"original_lang_code": "
|
1736 |
"script": "Latn",
|
1737 |
-
"class_name": "
|
1738 |
}
|
1739 |
},
|
1740 |
"node_i": "3481",
|
|
|
833 |
"children": [],
|
834 |
"tokenizers": {
|
835 |
"Latn": {
|
836 |
+
"full_object": "SpaCyTokenizer(\"ms\")",
|
837 |
+
"original_lang_name": "malay",
|
838 |
+
"original_lang_code": "msa",
|
839 |
"script": "Latn",
|
840 |
+
"class_name": "SpaCyTokenizer"
|
841 |
}
|
842 |
},
|
843 |
"node_i": "3540",
|
|
|
863 |
"children": [],
|
864 |
"tokenizers": {
|
865 |
"Latn": {
|
866 |
+
"full_object": "SpaCyTokenizer(\"ms\")",
|
867 |
+
"original_lang_name": "malay",
|
868 |
+
"original_lang_code": "msa",
|
869 |
"script": "Latn",
|
870 |
+
"class_name": "SpaCyTokenizer"
|
871 |
}
|
872 |
},
|
873 |
"node_i": "3542",
|
|
|
883 |
"children": [],
|
884 |
"tokenizers": {
|
885 |
"Latn": {
|
886 |
+
"full_object": "SpaCyTokenizer(\"ms\")",
|
887 |
+
"original_lang_name": "malay",
|
888 |
+
"original_lang_code": "msa",
|
889 |
"script": "Latn",
|
890 |
+
"class_name": "SpaCyTokenizer"
|
891 |
}
|
892 |
},
|
893 |
"node_i": "3543",
|
|
|
903 |
"children": [],
|
904 |
"tokenizers": {
|
905 |
"Latn": {
|
906 |
+
"full_object": "SpaCyTokenizer(\"ms\")",
|
907 |
+
"original_lang_name": "malay",
|
908 |
+
"original_lang_code": "msa",
|
909 |
"script": "Latn",
|
910 |
+
"class_name": "SpaCyTokenizer"
|
911 |
}
|
912 |
},
|
913 |
"node_i": "3544",
|
|
|
923 |
"children": [],
|
924 |
"tokenizers": {
|
925 |
"Latn": {
|
926 |
+
"full_object": "SpaCyTokenizer(\"ms\")",
|
927 |
+
"original_lang_name": "malay",
|
928 |
+
"original_lang_code": "msa",
|
929 |
"script": "Latn",
|
930 |
+
"class_name": "SpaCyTokenizer"
|
931 |
}
|
932 |
},
|
933 |
"node_i": "3545",
|
|
|
963 |
"children": [],
|
964 |
"tokenizers": {
|
965 |
"Latn": {
|
966 |
+
"full_object": "SpaCyTokenizer(\"ms\")",
|
967 |
+
"original_lang_name": "malay",
|
968 |
+
"original_lang_code": "msa",
|
969 |
"script": "Latn",
|
970 |
+
"class_name": "SpaCyTokenizer"
|
971 |
}
|
972 |
},
|
973 |
"node_i": "3548",
|
|
|
983 |
"children": [],
|
984 |
"tokenizers": {
|
985 |
"Latn": {
|
986 |
+
"full_object": "SpaCyTokenizer(\"ms\")",
|
987 |
+
"original_lang_name": "malay",
|
988 |
+
"original_lang_code": "msa",
|
989 |
"script": "Latn",
|
990 |
+
"class_name": "SpaCyTokenizer"
|
991 |
}
|
992 |
},
|
993 |
"node_i": "3549",
|
|
|
1009 |
],
|
1010 |
"tokenizers": {
|
1011 |
"Latn": {
|
1012 |
+
"full_object": "SpaCyTokenizer(\"ms\")",
|
1013 |
+
"original_lang_name": "malay",
|
1014 |
+
"original_lang_code": "msa",
|
1015 |
"script": "Latn",
|
1016 |
+
"class_name": "SpaCyTokenizer"
|
1017 |
}
|
1018 |
},
|
1019 |
"node_i": "3539",
|
|
|
1074 |
"children": [],
|
1075 |
"tokenizers": {
|
1076 |
"Latn": {
|
1077 |
+
"full_object": "SpaCyTokenizer(\"ms\")",
|
1078 |
+
"original_lang_name": "malay",
|
1079 |
+
"original_lang_code": "msa",
|
1080 |
"script": "Latn",
|
1081 |
+
"class_name": "SpaCyTokenizer"
|
1082 |
}
|
1083 |
},
|
1084 |
"node_i": "3556",
|
|
|
1090 |
],
|
1091 |
"tokenizers": {
|
1092 |
"Latn": {
|
1093 |
+
"full_object": "SpaCyTokenizer(\"ms\")",
|
1094 |
+
"original_lang_name": "malay",
|
1095 |
+
"original_lang_code": "msa",
|
1096 |
"script": "Latn",
|
1097 |
+
"class_name": "SpaCyTokenizer"
|
1098 |
}
|
1099 |
},
|
1100 |
"node_i": "3555",
|
|
|
1134 |
"children": [],
|
1135 |
"tokenizers": {
|
1136 |
"Latn": {
|
1137 |
+
"full_object": "SpaCyTokenizer(\"ms\")",
|
1138 |
+
"original_lang_name": "malay",
|
1139 |
+
"original_lang_code": "msa",
|
1140 |
"script": "Latn",
|
1141 |
+
"class_name": "SpaCyTokenizer"
|
1142 |
}
|
1143 |
},
|
1144 |
"node_i": "3560",
|
|
|
1160 |
],
|
1161 |
"tokenizers": {
|
1162 |
"Latn": {
|
1163 |
+
"full_object": "SpaCyTokenizer(\"ms\")",
|
1164 |
+
"original_lang_name": "malay",
|
1165 |
+
"original_lang_code": "msa",
|
1166 |
"script": "Latn",
|
1167 |
+
"class_name": "SpaCyTokenizer"
|
1168 |
}
|
1169 |
},
|
1170 |
"node_i": "3559",
|
|
|
1183 |
"children": [],
|
1184 |
"tokenizers": {
|
1185 |
"Latn": {
|
1186 |
+
"full_object": "SpaCyTokenizer(\"ms\")",
|
1187 |
+
"original_lang_name": "malay",
|
1188 |
+
"original_lang_code": "msa",
|
1189 |
"script": "Latn",
|
1190 |
+
"class_name": "SpaCyTokenizer"
|
1191 |
}
|
1192 |
},
|
1193 |
"node_i": "3563",
|
|
|
1203 |
"children": [],
|
1204 |
"tokenizers": {
|
1205 |
"Latn": {
|
1206 |
+
"full_object": "SpaCyTokenizer(\"ms\")",
|
1207 |
+
"original_lang_name": "malay",
|
1208 |
+
"original_lang_code": "msa",
|
1209 |
"script": "Latn",
|
1210 |
+
"class_name": "SpaCyTokenizer"
|
1211 |
}
|
1212 |
},
|
1213 |
"node_i": "3564",
|
|
|
1263 |
"children": [],
|
1264 |
"tokenizers": {
|
1265 |
"Latn": {
|
1266 |
+
"full_object": "SpaCyTokenizer(\"ms\")",
|
1267 |
+
"original_lang_name": "malay",
|
1268 |
+
"original_lang_code": "msa",
|
1269 |
"script": "Latn",
|
1270 |
+
"class_name": "SpaCyTokenizer"
|
1271 |
}
|
1272 |
},
|
1273 |
"node_i": "3569",
|
1274 |
+
"native_tokenizers": [
|
1275 |
+
"Latn"
|
1276 |
+
],
|
1277 |
"scripts": [
|
1278 |
"Latn"
|
1279 |
]
|
|
|
1285 |
"children": [],
|
1286 |
"tokenizers": {
|
1287 |
"Latn": {
|
1288 |
+
"full_object": "SpaCyTokenizer(\"ms\")",
|
1289 |
+
"original_lang_name": "malay",
|
1290 |
+
"original_lang_code": "msa",
|
1291 |
"script": "Latn",
|
1292 |
+
"class_name": "SpaCyTokenizer"
|
1293 |
}
|
1294 |
},
|
1295 |
"node_i": "3570",
|
|
|
1315 |
"children": [],
|
1316 |
"tokenizers": {
|
1317 |
"Latn": {
|
1318 |
+
"full_object": "SpaCyTokenizer(\"ms\")",
|
1319 |
+
"original_lang_name": "malay",
|
1320 |
+
"original_lang_code": "msa",
|
1321 |
"script": "Latn",
|
1322 |
+
"class_name": "SpaCyTokenizer"
|
1323 |
}
|
1324 |
},
|
1325 |
"node_i": "3572",
|
|
|
1365 |
"children": [],
|
1366 |
"tokenizers": {
|
1367 |
"Latn": {
|
1368 |
+
"full_object": "SpaCyTokenizer(\"ms\")",
|
1369 |
+
"original_lang_name": "malay",
|
1370 |
+
"original_lang_code": "msa",
|
1371 |
"script": "Latn",
|
1372 |
+
"class_name": "SpaCyTokenizer"
|
1373 |
}
|
1374 |
},
|
1375 |
"node_i": "3576",
|
1376 |
+
"native_tokenizers": [
|
1377 |
+
"Latn"
|
1378 |
+
],
|
1379 |
"scripts": [
|
1380 |
"Latn"
|
1381 |
]
|
|
|
1383 |
],
|
1384 |
"tokenizers": {
|
1385 |
"Latn": {
|
1386 |
+
"full_object": "SpaCyTokenizer(\"ms\")",
|
1387 |
+
"original_lang_name": "malay",
|
1388 |
+
"original_lang_code": "msa",
|
1389 |
"script": "Latn",
|
1390 |
+
"class_name": "SpaCyTokenizer"
|
1391 |
}
|
1392 |
},
|
1393 |
"node_i": "3562",
|
|
|
1406 |
"children": [],
|
1407 |
"tokenizers": {
|
1408 |
"Latn": {
|
1409 |
+
"full_object": "SpaCyTokenizer(\"ms\")",
|
1410 |
+
"original_lang_name": "malay",
|
1411 |
+
"original_lang_code": "msa",
|
1412 |
"script": "Latn",
|
1413 |
+
"class_name": "SpaCyTokenizer"
|
1414 |
}
|
1415 |
},
|
1416 |
"node_i": "3578",
|
|
|
1432 |
],
|
1433 |
"tokenizers": {
|
1434 |
"Latn": {
|
1435 |
+
"full_object": "SpaCyTokenizer(\"ms\")",
|
1436 |
+
"original_lang_name": "malay",
|
1437 |
+
"original_lang_code": "msa",
|
1438 |
"script": "Latn",
|
1439 |
+
"class_name": "SpaCyTokenizer"
|
1440 |
}
|
1441 |
},
|
1442 |
"node_i": "3577",
|
|
|
1475 |
"children": [],
|
1476 |
"tokenizers": {
|
1477 |
"Latn": {
|
1478 |
+
"full_object": "SpaCyTokenizer(\"ms\")",
|
1479 |
+
"original_lang_name": "malay",
|
1480 |
+
"original_lang_code": "msa",
|
1481 |
"script": "Latn",
|
1482 |
+
"class_name": "SpaCyTokenizer"
|
1483 |
}
|
1484 |
},
|
1485 |
"node_i": "3583",
|
|
|
1515 |
"children": [],
|
1516 |
"tokenizers": {
|
1517 |
"Latn": {
|
1518 |
+
"full_object": "SpaCyTokenizer(\"ms\")",
|
1519 |
+
"original_lang_name": "malay",
|
1520 |
+
"original_lang_code": "msa",
|
1521 |
"script": "Latn",
|
1522 |
+
"class_name": "SpaCyTokenizer"
|
1523 |
}
|
1524 |
},
|
1525 |
"node_i": "3586",
|
|
|
1555 |
"children": [],
|
1556 |
"tokenizers": {
|
1557 |
"Latn": {
|
1558 |
+
"full_object": "SpaCyTokenizer(\"ms\")",
|
1559 |
+
"original_lang_name": "malay",
|
1560 |
+
"original_lang_code": "msa",
|
1561 |
"script": "Latn",
|
1562 |
+
"class_name": "SpaCyTokenizer"
|
1563 |
}
|
1564 |
},
|
1565 |
"node_i": "3589",
|
|
|
1611 |
],
|
1612 |
"tokenizers": {
|
1613 |
"Latn": {
|
1614 |
+
"full_object": "SpaCyTokenizer(\"ms\")",
|
1615 |
+
"original_lang_name": "malay",
|
1616 |
+
"original_lang_code": "msa",
|
1617 |
"script": "Latn",
|
1618 |
+
"class_name": "SpaCyTokenizer"
|
1619 |
}
|
1620 |
},
|
1621 |
"node_i": "3580",
|
|
|
1634 |
"children": [],
|
1635 |
"tokenizers": {
|
1636 |
"Latn": {
|
1637 |
+
"full_object": "SpaCyTokenizer(\"ms\")",
|
1638 |
+
"original_lang_name": "malay",
|
1639 |
+
"original_lang_code": "msa",
|
1640 |
"script": "Latn",
|
1641 |
+
"class_name": "SpaCyTokenizer"
|
1642 |
}
|
1643 |
},
|
1644 |
"node_i": "3595",
|
|
|
1660 |
],
|
1661 |
"tokenizers": {
|
1662 |
"Latn": {
|
1663 |
+
"full_object": "SpaCyTokenizer(\"ms\")",
|
1664 |
+
"original_lang_name": "malay",
|
1665 |
+
"original_lang_code": "msa",
|
1666 |
"script": "Latn",
|
1667 |
+
"class_name": "SpaCyTokenizer"
|
1668 |
}
|
1669 |
},
|
1670 |
"node_i": "3594",
|
|
|
1704 |
"children": [],
|
1705 |
"tokenizers": {
|
1706 |
"Latn": {
|
1707 |
+
"full_object": "SpaCyTokenizer(\"ms\")",
|
1708 |
+
"original_lang_name": "malay",
|
1709 |
+
"original_lang_code": "msa",
|
1710 |
"script": "Latn",
|
1711 |
+
"class_name": "SpaCyTokenizer"
|
1712 |
}
|
1713 |
},
|
1714 |
"node_i": "3600",
|
|
|
1720 |
],
|
1721 |
"tokenizers": {
|
1722 |
"Latn": {
|
1723 |
+
"full_object": "SpaCyTokenizer(\"ms\")",
|
1724 |
+
"original_lang_name": "malay",
|
1725 |
+
"original_lang_code": "msa",
|
1726 |
"script": "Latn",
|
1727 |
+
"class_name": "SpaCyTokenizer"
|
1728 |
}
|
1729 |
},
|
1730 |
"node_i": "3599",
|
|
|
1734 |
],
|
1735 |
"tokenizers": {
|
1736 |
"Latn": {
|
1737 |
+
"full_object": "SpaCyTokenizer(\"ms\")",
|
1738 |
+
"original_lang_name": "malay",
|
1739 |
+
"original_lang_code": "msa",
|
1740 |
"script": "Latn",
|
1741 |
+
"class_name": "SpaCyTokenizer"
|
1742 |
}
|
1743 |
},
|
1744 |
"node_i": "3481",
|
data/Indo-European.json
CHANGED
@@ -20,15 +20,17 @@
|
|
20 |
"children": [],
|
21 |
"tokenizers": {
|
22 |
"Latn": {
|
23 |
-
"full_object": "SpaCyTokenizer(\"
|
24 |
-
"original_lang_name": "
|
25 |
-
"original_lang_code": "
|
26 |
"script": "Latn",
|
27 |
"class_name": "SpaCyTokenizer"
|
28 |
}
|
29 |
},
|
30 |
"node_i": "3922",
|
31 |
-
"native_tokenizers": [
|
|
|
|
|
32 |
"scripts": [
|
33 |
"Latn"
|
34 |
]
|
@@ -36,9 +38,9 @@
|
|
36 |
],
|
37 |
"tokenizers": {
|
38 |
"Latn": {
|
39 |
-
"full_object": "SpaCyTokenizer(\"
|
40 |
-
"original_lang_name": "
|
41 |
-
"original_lang_code": "
|
42 |
"script": "Latn",
|
43 |
"class_name": "SpaCyTokenizer"
|
44 |
}
|
@@ -79,15 +81,17 @@
|
|
79 |
"children": [],
|
80 |
"tokenizers": {
|
81 |
"Latn": {
|
82 |
-
"full_object": "SpaCyTokenizer(\"
|
83 |
-
"original_lang_name": "
|
84 |
-
"original_lang_code": "
|
85 |
"script": "Latn",
|
86 |
"class_name": "SpaCyTokenizer"
|
87 |
}
|
88 |
},
|
89 |
"node_i": "3926",
|
90 |
-
"native_tokenizers": [
|
|
|
|
|
91 |
"scripts": [
|
92 |
"Latn"
|
93 |
]
|
@@ -95,9 +99,9 @@
|
|
95 |
],
|
96 |
"tokenizers": {
|
97 |
"Latn": {
|
98 |
-
"full_object": "SpaCyTokenizer(\"
|
99 |
-
"original_lang_name": "
|
100 |
-
"original_lang_code": "
|
101 |
"script": "Latn",
|
102 |
"class_name": "SpaCyTokenizer"
|
103 |
}
|
@@ -109,9 +113,9 @@
|
|
109 |
],
|
110 |
"tokenizers": {
|
111 |
"Latn": {
|
112 |
-
"full_object": "SpaCyTokenizer(\"
|
113 |
-
"original_lang_name": "
|
114 |
-
"original_lang_code": "
|
115 |
"script": "Latn",
|
116 |
"class_name": "SpaCyTokenizer"
|
117 |
}
|
@@ -227,15 +231,17 @@
|
|
227 |
"children": [],
|
228 |
"tokenizers": {
|
229 |
"Latn": {
|
230 |
-
"full_object": "SpaCyTokenizer(\"
|
231 |
-
"original_lang_name": "
|
232 |
-
"original_lang_code": "
|
233 |
"script": "Latn",
|
234 |
"class_name": "SpaCyTokenizer"
|
235 |
}
|
236 |
},
|
237 |
"node_i": "3934",
|
238 |
-
"native_tokenizers": [
|
|
|
|
|
239 |
"scripts": [
|
240 |
"Latn"
|
241 |
]
|
@@ -247,15 +253,17 @@
|
|
247 |
"children": [],
|
248 |
"tokenizers": {
|
249 |
"Latn": {
|
250 |
-
"full_object": "SpaCyTokenizer(\"
|
251 |
-
"original_lang_name": "
|
252 |
-
"original_lang_code": "
|
253 |
"script": "Latn",
|
254 |
"class_name": "SpaCyTokenizer"
|
255 |
}
|
256 |
},
|
257 |
"node_i": "3935",
|
258 |
-
"native_tokenizers": [
|
|
|
|
|
259 |
"scripts": [
|
260 |
"Latn"
|
261 |
]
|
@@ -267,9 +275,9 @@
|
|
267 |
"children": [],
|
268 |
"tokenizers": {
|
269 |
"Latn": {
|
270 |
-
"full_object": "SpaCyTokenizer(\"
|
271 |
-
"original_lang_name": "
|
272 |
-
"original_lang_code": "
|
273 |
"script": "Latn",
|
274 |
"class_name": "SpaCyTokenizer"
|
275 |
}
|
@@ -293,9 +301,9 @@
|
|
293 |
],
|
294 |
"tokenizers": {
|
295 |
"Latn": {
|
296 |
-
"full_object": "SpaCyTokenizer(\"
|
297 |
-
"original_lang_name": "
|
298 |
-
"original_lang_code": "
|
299 |
"script": "Latn",
|
300 |
"class_name": "SpaCyTokenizer"
|
301 |
}
|
@@ -316,9 +324,9 @@
|
|
316 |
"children": [],
|
317 |
"tokenizers": {
|
318 |
"Latn": {
|
319 |
-
"full_object": "SpaCyTokenizer(\"
|
320 |
-
"original_lang_name": "
|
321 |
-
"original_lang_code": "
|
322 |
"script": "Latn",
|
323 |
"class_name": "SpaCyTokenizer"
|
324 |
}
|
@@ -342,9 +350,9 @@
|
|
342 |
],
|
343 |
"tokenizers": {
|
344 |
"Latn": {
|
345 |
-
"full_object": "SpaCyTokenizer(\"
|
346 |
-
"original_lang_name": "
|
347 |
-
"original_lang_code": "
|
348 |
"script": "Latn",
|
349 |
"class_name": "SpaCyTokenizer"
|
350 |
}
|
@@ -356,9 +364,9 @@
|
|
356 |
],
|
357 |
"tokenizers": {
|
358 |
"Latn": {
|
359 |
-
"full_object": "SpaCyTokenizer(\"
|
360 |
-
"original_lang_name": "
|
361 |
-
"original_lang_code": "
|
362 |
"script": "Latn",
|
363 |
"class_name": "SpaCyTokenizer"
|
364 |
}
|
@@ -589,15 +597,17 @@
|
|
589 |
"children": [],
|
590 |
"tokenizers": {
|
591 |
"Latn": {
|
592 |
-
"full_object": "SpaCyTokenizer(\"
|
593 |
-
"original_lang_name": "
|
594 |
-
"original_lang_code": "
|
595 |
"script": "Latn",
|
596 |
"class_name": "SpaCyTokenizer"
|
597 |
}
|
598 |
},
|
599 |
"node_i": "3954",
|
600 |
-
"native_tokenizers": [
|
|
|
|
|
601 |
"scripts": [
|
602 |
"Latn"
|
603 |
]
|
@@ -609,9 +619,9 @@
|
|
609 |
"children": [],
|
610 |
"tokenizers": {
|
611 |
"Latn": {
|
612 |
-
"full_object": "SpaCyTokenizer(\"
|
613 |
-
"original_lang_name": "
|
614 |
-
"original_lang_code": "
|
615 |
"script": "Latn",
|
616 |
"class_name": "SpaCyTokenizer"
|
617 |
}
|
@@ -629,15 +639,17 @@
|
|
629 |
"children": [],
|
630 |
"tokenizers": {
|
631 |
"Latn": {
|
632 |
-
"full_object": "SpaCyTokenizer(\"
|
633 |
-
"original_lang_name": "
|
634 |
-
"original_lang_code": "
|
635 |
"script": "Latn",
|
636 |
"class_name": "SpaCyTokenizer"
|
637 |
}
|
638 |
},
|
639 |
"node_i": "3956",
|
640 |
-
"native_tokenizers": [
|
|
|
|
|
641 |
"scripts": [
|
642 |
"Latn"
|
643 |
]
|
@@ -700,16 +712,17 @@
|
|
700 |
"class_name": "SpaCyTokenizer"
|
701 |
},
|
702 |
"Latn": {
|
703 |
-
"full_object": "SpaCyTokenizer(\"
|
704 |
-
"original_lang_name": "
|
705 |
-
"original_lang_code": "
|
706 |
"script": "Latn",
|
707 |
"class_name": "SpaCyTokenizer"
|
708 |
}
|
709 |
},
|
710 |
"node_i": "3959",
|
711 |
"native_tokenizers": [
|
712 |
-
"Cyrl"
|
|
|
713 |
],
|
714 |
"scripts": [
|
715 |
"Cyrl",
|
@@ -729,9 +742,9 @@
|
|
729 |
],
|
730 |
"tokenizers": {
|
731 |
"Latn": {
|
732 |
-
"full_object": "SpaCyTokenizer(\"
|
733 |
-
"original_lang_name": "
|
734 |
-
"original_lang_code": "
|
735 |
"script": "Latn",
|
736 |
"class_name": "SpaCyTokenizer"
|
737 |
},
|
@@ -757,9 +770,9 @@
|
|
757 |
"class_name": "SpaCyTokenizer"
|
758 |
},
|
759 |
"Latn": {
|
760 |
-
"full_object": "SpaCyTokenizer(\"
|
761 |
-
"original_lang_name": "
|
762 |
-
"original_lang_code": "
|
763 |
"script": "Latn",
|
764 |
"class_name": "SpaCyTokenizer"
|
765 |
}
|
@@ -2694,9 +2707,9 @@
|
|
2694 |
"children": [],
|
2695 |
"tokenizers": {
|
2696 |
"Deva": {
|
2697 |
-
"full_object": "IndicNLPTokenizer(\"
|
2698 |
-
"original_lang_name": "
|
2699 |
-
"original_lang_code": "
|
2700 |
"script": "Deva",
|
2701 |
"class_name": "IndicNLPTokenizer"
|
2702 |
}
|
@@ -2744,9 +2757,9 @@
|
|
2744 |
"children": [],
|
2745 |
"tokenizers": {
|
2746 |
"Deva": {
|
2747 |
-
"full_object": "IndicNLPTokenizer(\"
|
2748 |
-
"original_lang_name": "
|
2749 |
-
"original_lang_code": "
|
2750 |
"script": "Deva",
|
2751 |
"class_name": "IndicNLPTokenizer"
|
2752 |
}
|
@@ -2780,9 +2793,9 @@
|
|
2780 |
],
|
2781 |
"tokenizers": {
|
2782 |
"Deva": {
|
2783 |
-
"full_object": "IndicNLPTokenizer(\"
|
2784 |
-
"original_lang_name": "
|
2785 |
-
"original_lang_code": "
|
2786 |
"script": "Deva",
|
2787 |
"class_name": "IndicNLPTokenizer"
|
2788 |
},
|
@@ -2810,15 +2823,17 @@
|
|
2810 |
"children": [],
|
2811 |
"tokenizers": {
|
2812 |
"Deva": {
|
2813 |
-
"full_object": "IndicNLPTokenizer(\"
|
2814 |
-
"original_lang_name": "
|
2815 |
-
"original_lang_code": "
|
2816 |
"script": "Deva",
|
2817 |
"class_name": "IndicNLPTokenizer"
|
2818 |
}
|
2819 |
},
|
2820 |
"node_i": "4080",
|
2821 |
-
"native_tokenizers": [
|
|
|
|
|
2822 |
"scripts": [
|
2823 |
"Deva"
|
2824 |
]
|
@@ -2840,9 +2855,9 @@
|
|
2840 |
"children": [],
|
2841 |
"tokenizers": {
|
2842 |
"Deva": {
|
2843 |
-
"full_object": "IndicNLPTokenizer(\"
|
2844 |
-
"original_lang_name": "
|
2845 |
-
"original_lang_code": "
|
2846 |
"script": "Deva",
|
2847 |
"class_name": "IndicNLPTokenizer"
|
2848 |
},
|
@@ -2855,7 +2870,9 @@
|
|
2855 |
}
|
2856 |
},
|
2857 |
"node_i": "4082",
|
2858 |
-
"native_tokenizers": [
|
|
|
|
|
2859 |
"scripts": [
|
2860 |
"Latn",
|
2861 |
"Deva"
|
@@ -2864,9 +2881,9 @@
|
|
2864 |
],
|
2865 |
"tokenizers": {
|
2866 |
"Deva": {
|
2867 |
-
"full_object": "IndicNLPTokenizer(\"
|
2868 |
-
"original_lang_name": "
|
2869 |
-
"original_lang_code": "
|
2870 |
"script": "Deva",
|
2871 |
"class_name": "IndicNLPTokenizer"
|
2872 |
},
|
@@ -2885,9 +2902,9 @@
|
|
2885 |
],
|
2886 |
"tokenizers": {
|
2887 |
"Deva": {
|
2888 |
-
"full_object": "IndicNLPTokenizer(\"
|
2889 |
-
"original_lang_name": "
|
2890 |
-
"original_lang_code": "
|
2891 |
"script": "Deva",
|
2892 |
"class_name": "IndicNLPTokenizer"
|
2893 |
},
|
@@ -3941,9 +3958,9 @@
|
|
3941 |
"children": [],
|
3942 |
"tokenizers": {
|
3943 |
"Deva": {
|
3944 |
-
"full_object": "IndicNLPTokenizer(\"
|
3945 |
-
"original_lang_name": "
|
3946 |
-
"original_lang_code": "
|
3947 |
"script": "Deva",
|
3948 |
"class_name": "IndicNLPTokenizer"
|
3949 |
}
|
@@ -3967,9 +3984,9 @@
|
|
3967 |
],
|
3968 |
"tokenizers": {
|
3969 |
"Deva": {
|
3970 |
-
"full_object": "IndicNLPTokenizer(\"
|
3971 |
-
"original_lang_name": "
|
3972 |
-
"original_lang_code": "
|
3973 |
"script": "Deva",
|
3974 |
"class_name": "IndicNLPTokenizer"
|
3975 |
}
|
@@ -3981,9 +3998,9 @@
|
|
3981 |
],
|
3982 |
"tokenizers": {
|
3983 |
"Deva": {
|
3984 |
-
"full_object": "IndicNLPTokenizer(\"
|
3985 |
-
"original_lang_name": "
|
3986 |
-
"original_lang_code": "
|
3987 |
"script": "Deva",
|
3988 |
"class_name": "IndicNLPTokenizer"
|
3989 |
}
|
@@ -4295,9 +4312,9 @@
|
|
4295 |
"class_name": "IndicNLPTokenizer"
|
4296 |
},
|
4297 |
"Deva": {
|
4298 |
-
"full_object": "IndicNLPTokenizer(\"
|
4299 |
-
"original_lang_name": "
|
4300 |
-
"original_lang_code": "
|
4301 |
"script": "Deva",
|
4302 |
"class_name": "IndicNLPTokenizer"
|
4303 |
},
|
@@ -4336,6 +4353,13 @@
|
|
4336 |
}
|
4337 |
],
|
4338 |
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4339 |
"Gujr": {
|
4340 |
"full_object": "IndicNLPTokenizer(\"gu\")",
|
4341 |
"original_lang_name": "gujarati",
|
@@ -4350,13 +4374,6 @@
|
|
4350 |
"script": "Guru",
|
4351 |
"class_name": "IndicNLPTokenizer"
|
4352 |
},
|
4353 |
-
"Deva": {
|
4354 |
-
"full_object": "IndicNLPTokenizer(\"hi\")",
|
4355 |
-
"original_lang_name": "hindi",
|
4356 |
-
"original_lang_code": "hin",
|
4357 |
-
"script": "Deva",
|
4358 |
-
"class_name": "IndicNLPTokenizer"
|
4359 |
-
},
|
4360 |
"Arab": {
|
4361 |
"full_object": "IndicNLPTokenizer(\"ur\")",
|
4362 |
"original_lang_name": "urdu",
|
@@ -4990,6 +5007,13 @@
|
|
4990 |
"iso_3_code": "ory",
|
4991 |
"children": [],
|
4992 |
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4993 |
"Latn": {
|
4994 |
"full_object": "StanzaTokenizer(\"kmr\")",
|
4995 |
"original_lang_name": "northern_kurdish",
|
@@ -4999,7 +5023,9 @@
|
|
4999 |
}
|
5000 |
},
|
5001 |
"node_i": "4234",
|
5002 |
-
"native_tokenizers": [
|
|
|
|
|
5003 |
"scripts": [
|
5004 |
"Latn",
|
5005 |
"Orya"
|
@@ -5027,6 +5053,13 @@
|
|
5027 |
}
|
5028 |
],
|
5029 |
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5030 |
"Latn": {
|
5031 |
"full_object": "StanzaTokenizer(\"kmr\")",
|
5032 |
"original_lang_name": "northern_kurdish",
|
@@ -5114,6 +5147,13 @@
|
|
5114 |
"script": "Deva",
|
5115 |
"class_name": "IndicNLPTokenizer"
|
5116 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5117 |
"Latn": {
|
5118 |
"full_object": "StanzaTokenizer(\"kmr\")",
|
5119 |
"original_lang_name": "northern_kurdish",
|
@@ -5756,9 +5796,9 @@
|
|
5756 |
"children": [],
|
5757 |
"tokenizers": {
|
5758 |
"Deva": {
|
5759 |
-
"full_object": "IndicNLPTokenizer(\"
|
5760 |
-
"original_lang_name": "
|
5761 |
-
"original_lang_code": "
|
5762 |
"script": "Deva",
|
5763 |
"class_name": "IndicNLPTokenizer"
|
5764 |
},
|
@@ -5771,7 +5811,9 @@
|
|
5771 |
}
|
5772 |
},
|
5773 |
"node_i": "4287",
|
5774 |
-
"native_tokenizers": [
|
|
|
|
|
5775 |
"scripts": [
|
5776 |
"Latn",
|
5777 |
"Deva"
|
@@ -5784,9 +5826,9 @@
|
|
5784 |
"children": [],
|
5785 |
"tokenizers": {
|
5786 |
"Deva": {
|
5787 |
-
"full_object": "IndicNLPTokenizer(\"
|
5788 |
-
"original_lang_name": "
|
5789 |
-
"original_lang_code": "
|
5790 |
"script": "Deva",
|
5791 |
"class_name": "IndicNLPTokenizer"
|
5792 |
}
|
@@ -5850,9 +5892,9 @@
|
|
5850 |
],
|
5851 |
"tokenizers": {
|
5852 |
"Deva": {
|
5853 |
-
"full_object": "IndicNLPTokenizer(\"
|
5854 |
-
"original_lang_name": "
|
5855 |
-
"original_lang_code": "
|
5856 |
"script": "Deva",
|
5857 |
"class_name": "IndicNLPTokenizer"
|
5858 |
},
|
@@ -6026,6 +6068,13 @@
|
|
6026 |
"script": "Deva",
|
6027 |
"class_name": "IndicNLPTokenizer"
|
6028 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6029 |
"Arab": {
|
6030 |
"full_object": "IndicNLPTokenizer(\"ur\")",
|
6031 |
"original_lang_name": "sindhi",
|
@@ -6570,6 +6619,13 @@
|
|
6570 |
"script": "Beng",
|
6571 |
"class_name": "IndicNLPTokenizer"
|
6572 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6573 |
"Arab": {
|
6574 |
"full_object": "IndicNLPTokenizer(\"ur\")",
|
6575 |
"original_lang_name": "urdu",
|
@@ -8215,6 +8271,13 @@
|
|
8215 |
"script": "Beng",
|
8216 |
"class_name": "IndicNLPTokenizer"
|
8217 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8218 |
"Arab": {
|
8219 |
"full_object": "IndicNLPTokenizer(\"ur\")",
|
8220 |
"original_lang_name": "urdu",
|
@@ -9568,13 +9631,6 @@
|
|
9568 |
}
|
9569 |
],
|
9570 |
"tokenizers": {
|
9571 |
-
"Armn": {
|
9572 |
-
"full_object": "SpaCyTokenizer(\"hy\")",
|
9573 |
-
"original_lang_name": "armenian",
|
9574 |
-
"original_lang_code": "hye",
|
9575 |
-
"script": "Armn",
|
9576 |
-
"class_name": "SpaCyTokenizer"
|
9577 |
-
},
|
9578 |
"Latn": {
|
9579 |
"full_object": "SpaCyTokenizer(\"en\")",
|
9580 |
"original_lang_name": "english",
|
@@ -9582,6 +9638,13 @@
|
|
9582 |
"script": "Latn",
|
9583 |
"class_name": "SpaCyTokenizer"
|
9584 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9585 |
"Cyrl": {
|
9586 |
"full_object": "SpaCyTokenizer(\"ru\")",
|
9587 |
"original_lang_name": "russian",
|
@@ -9624,6 +9687,13 @@
|
|
9624 |
"script": "Beng",
|
9625 |
"class_name": "IndicNLPTokenizer"
|
9626 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9627 |
"Arab": {
|
9628 |
"full_object": "IndicNLPTokenizer(\"ur\")",
|
9629 |
"original_lang_name": "urdu",
|
|
|
20 |
"children": [],
|
21 |
"tokenizers": {
|
22 |
"Latn": {
|
23 |
+
"full_object": "SpaCyTokenizer(\"sq\")",
|
24 |
+
"original_lang_name": "albanian",
|
25 |
+
"original_lang_code": "sqi",
|
26 |
"script": "Latn",
|
27 |
"class_name": "SpaCyTokenizer"
|
28 |
}
|
29 |
},
|
30 |
"node_i": "3922",
|
31 |
+
"native_tokenizers": [
|
32 |
+
"Latn"
|
33 |
+
],
|
34 |
"scripts": [
|
35 |
"Latn"
|
36 |
]
|
|
|
38 |
],
|
39 |
"tokenizers": {
|
40 |
"Latn": {
|
41 |
+
"full_object": "SpaCyTokenizer(\"sq\")",
|
42 |
+
"original_lang_name": "albanian",
|
43 |
+
"original_lang_code": "sqi",
|
44 |
"script": "Latn",
|
45 |
"class_name": "SpaCyTokenizer"
|
46 |
}
|
|
|
81 |
"children": [],
|
82 |
"tokenizers": {
|
83 |
"Latn": {
|
84 |
+
"full_object": "SpaCyTokenizer(\"sq\")",
|
85 |
+
"original_lang_name": "albanian",
|
86 |
+
"original_lang_code": "sqi",
|
87 |
"script": "Latn",
|
88 |
"class_name": "SpaCyTokenizer"
|
89 |
}
|
90 |
},
|
91 |
"node_i": "3926",
|
92 |
+
"native_tokenizers": [
|
93 |
+
"Latn"
|
94 |
+
],
|
95 |
"scripts": [
|
96 |
"Latn"
|
97 |
]
|
|
|
99 |
],
|
100 |
"tokenizers": {
|
101 |
"Latn": {
|
102 |
+
"full_object": "SpaCyTokenizer(\"sq\")",
|
103 |
+
"original_lang_name": "albanian",
|
104 |
+
"original_lang_code": "sqi",
|
105 |
"script": "Latn",
|
106 |
"class_name": "SpaCyTokenizer"
|
107 |
}
|
|
|
113 |
],
|
114 |
"tokenizers": {
|
115 |
"Latn": {
|
116 |
+
"full_object": "SpaCyTokenizer(\"sq\")",
|
117 |
+
"original_lang_name": "albanian",
|
118 |
+
"original_lang_code": "sqi",
|
119 |
"script": "Latn",
|
120 |
"class_name": "SpaCyTokenizer"
|
121 |
}
|
|
|
231 |
"children": [],
|
232 |
"tokenizers": {
|
233 |
"Latn": {
|
234 |
+
"full_object": "SpaCyTokenizer(\"lv\")",
|
235 |
+
"original_lang_name": "latvian",
|
236 |
+
"original_lang_code": "lav",
|
237 |
"script": "Latn",
|
238 |
"class_name": "SpaCyTokenizer"
|
239 |
}
|
240 |
},
|
241 |
"node_i": "3934",
|
242 |
+
"native_tokenizers": [
|
243 |
+
"Latn"
|
244 |
+
],
|
245 |
"scripts": [
|
246 |
"Latn"
|
247 |
]
|
|
|
253 |
"children": [],
|
254 |
"tokenizers": {
|
255 |
"Latn": {
|
256 |
+
"full_object": "SpaCyTokenizer(\"lv\")",
|
257 |
+
"original_lang_name": "latvian",
|
258 |
+
"original_lang_code": "lav",
|
259 |
"script": "Latn",
|
260 |
"class_name": "SpaCyTokenizer"
|
261 |
}
|
262 |
},
|
263 |
"node_i": "3935",
|
264 |
+
"native_tokenizers": [
|
265 |
+
"Latn"
|
266 |
+
],
|
267 |
"scripts": [
|
268 |
"Latn"
|
269 |
]
|
|
|
275 |
"children": [],
|
276 |
"tokenizers": {
|
277 |
"Latn": {
|
278 |
+
"full_object": "SpaCyTokenizer(\"lv\")",
|
279 |
+
"original_lang_name": "latvian",
|
280 |
+
"original_lang_code": "lav",
|
281 |
"script": "Latn",
|
282 |
"class_name": "SpaCyTokenizer"
|
283 |
}
|
|
|
301 |
],
|
302 |
"tokenizers": {
|
303 |
"Latn": {
|
304 |
+
"full_object": "SpaCyTokenizer(\"lv\")",
|
305 |
+
"original_lang_name": "latvian",
|
306 |
+
"original_lang_code": "lav",
|
307 |
"script": "Latn",
|
308 |
"class_name": "SpaCyTokenizer"
|
309 |
}
|
|
|
324 |
"children": [],
|
325 |
"tokenizers": {
|
326 |
"Latn": {
|
327 |
+
"full_object": "SpaCyTokenizer(\"lv\")",
|
328 |
+
"original_lang_name": "latvian",
|
329 |
+
"original_lang_code": "lav",
|
330 |
"script": "Latn",
|
331 |
"class_name": "SpaCyTokenizer"
|
332 |
}
|
|
|
350 |
],
|
351 |
"tokenizers": {
|
352 |
"Latn": {
|
353 |
+
"full_object": "SpaCyTokenizer(\"lv\")",
|
354 |
+
"original_lang_name": "latvian",
|
355 |
+
"original_lang_code": "lav",
|
356 |
"script": "Latn",
|
357 |
"class_name": "SpaCyTokenizer"
|
358 |
}
|
|
|
364 |
],
|
365 |
"tokenizers": {
|
366 |
"Latn": {
|
367 |
+
"full_object": "SpaCyTokenizer(\"lv\")",
|
368 |
+
"original_lang_name": "latvian",
|
369 |
+
"original_lang_code": "lav",
|
370 |
"script": "Latn",
|
371 |
"class_name": "SpaCyTokenizer"
|
372 |
}
|
|
|
597 |
"children": [],
|
598 |
"tokenizers": {
|
599 |
"Latn": {
|
600 |
+
"full_object": "SpaCyTokenizer(\"sr\")",
|
601 |
+
"original_lang_name": "serbocroatian",
|
602 |
+
"original_lang_code": "hbs",
|
603 |
"script": "Latn",
|
604 |
"class_name": "SpaCyTokenizer"
|
605 |
}
|
606 |
},
|
607 |
"node_i": "3954",
|
608 |
+
"native_tokenizers": [
|
609 |
+
"Latn"
|
610 |
+
],
|
611 |
"scripts": [
|
612 |
"Latn"
|
613 |
]
|
|
|
619 |
"children": [],
|
620 |
"tokenizers": {
|
621 |
"Latn": {
|
622 |
+
"full_object": "SpaCyTokenizer(\"sr\")",
|
623 |
+
"original_lang_name": "serbocroatian",
|
624 |
+
"original_lang_code": "hbs",
|
625 |
"script": "Latn",
|
626 |
"class_name": "SpaCyTokenizer"
|
627 |
}
|
|
|
639 |
"children": [],
|
640 |
"tokenizers": {
|
641 |
"Latn": {
|
642 |
+
"full_object": "SpaCyTokenizer(\"sr\")",
|
643 |
+
"original_lang_name": "serbocroatian",
|
644 |
+
"original_lang_code": "hbs",
|
645 |
"script": "Latn",
|
646 |
"class_name": "SpaCyTokenizer"
|
647 |
}
|
648 |
},
|
649 |
"node_i": "3956",
|
650 |
+
"native_tokenizers": [
|
651 |
+
"Latn"
|
652 |
+
],
|
653 |
"scripts": [
|
654 |
"Latn"
|
655 |
]
|
|
|
712 |
"class_name": "SpaCyTokenizer"
|
713 |
},
|
714 |
"Latn": {
|
715 |
+
"full_object": "SpaCyTokenizer(\"sr\")",
|
716 |
+
"original_lang_name": "serbocroatian",
|
717 |
+
"original_lang_code": "hbs",
|
718 |
"script": "Latn",
|
719 |
"class_name": "SpaCyTokenizer"
|
720 |
}
|
721 |
},
|
722 |
"node_i": "3959",
|
723 |
"native_tokenizers": [
|
724 |
+
"Cyrl",
|
725 |
+
"Latn"
|
726 |
],
|
727 |
"scripts": [
|
728 |
"Cyrl",
|
|
|
742 |
],
|
743 |
"tokenizers": {
|
744 |
"Latn": {
|
745 |
+
"full_object": "SpaCyTokenizer(\"sr\")",
|
746 |
+
"original_lang_name": "serbocroatian",
|
747 |
+
"original_lang_code": "hbs",
|
748 |
"script": "Latn",
|
749 |
"class_name": "SpaCyTokenizer"
|
750 |
},
|
|
|
770 |
"class_name": "SpaCyTokenizer"
|
771 |
},
|
772 |
"Latn": {
|
773 |
+
"full_object": "SpaCyTokenizer(\"sr\")",
|
774 |
+
"original_lang_name": "serbocroatian",
|
775 |
+
"original_lang_code": "hbs",
|
776 |
"script": "Latn",
|
777 |
"class_name": "SpaCyTokenizer"
|
778 |
}
|
|
|
2707 |
"children": [],
|
2708 |
"tokenizers": {
|
2709 |
"Deva": {
|
2710 |
+
"full_object": "IndicNLPTokenizer(\"ne\")",
|
2711 |
+
"original_lang_name": "nepali",
|
2712 |
+
"original_lang_code": "nep",
|
2713 |
"script": "Deva",
|
2714 |
"class_name": "IndicNLPTokenizer"
|
2715 |
}
|
|
|
2757 |
"children": [],
|
2758 |
"tokenizers": {
|
2759 |
"Deva": {
|
2760 |
+
"full_object": "IndicNLPTokenizer(\"ne\")",
|
2761 |
+
"original_lang_name": "nepali",
|
2762 |
+
"original_lang_code": "nep",
|
2763 |
"script": "Deva",
|
2764 |
"class_name": "IndicNLPTokenizer"
|
2765 |
}
|
|
|
2793 |
],
|
2794 |
"tokenizers": {
|
2795 |
"Deva": {
|
2796 |
+
"full_object": "IndicNLPTokenizer(\"ne\")",
|
2797 |
+
"original_lang_name": "nepali",
|
2798 |
+
"original_lang_code": "nep",
|
2799 |
"script": "Deva",
|
2800 |
"class_name": "IndicNLPTokenizer"
|
2801 |
},
|
|
|
2823 |
"children": [],
|
2824 |
"tokenizers": {
|
2825 |
"Deva": {
|
2826 |
+
"full_object": "IndicNLPTokenizer(\"ne\")",
|
2827 |
+
"original_lang_name": "nepali",
|
2828 |
+
"original_lang_code": "nep",
|
2829 |
"script": "Deva",
|
2830 |
"class_name": "IndicNLPTokenizer"
|
2831 |
}
|
2832 |
},
|
2833 |
"node_i": "4080",
|
2834 |
+
"native_tokenizers": [
|
2835 |
+
"Deva"
|
2836 |
+
],
|
2837 |
"scripts": [
|
2838 |
"Deva"
|
2839 |
]
|
|
|
2855 |
"children": [],
|
2856 |
"tokenizers": {
|
2857 |
"Deva": {
|
2858 |
+
"full_object": "IndicNLPTokenizer(\"ne\")",
|
2859 |
+
"original_lang_name": "nepali",
|
2860 |
+
"original_lang_code": "nep",
|
2861 |
"script": "Deva",
|
2862 |
"class_name": "IndicNLPTokenizer"
|
2863 |
},
|
|
|
2870 |
}
|
2871 |
},
|
2872 |
"node_i": "4082",
|
2873 |
+
"native_tokenizers": [
|
2874 |
+
"Deva"
|
2875 |
+
],
|
2876 |
"scripts": [
|
2877 |
"Latn",
|
2878 |
"Deva"
|
|
|
2881 |
],
|
2882 |
"tokenizers": {
|
2883 |
"Deva": {
|
2884 |
+
"full_object": "IndicNLPTokenizer(\"ne\")",
|
2885 |
+
"original_lang_name": "nepali",
|
2886 |
+
"original_lang_code": "nep",
|
2887 |
"script": "Deva",
|
2888 |
"class_name": "IndicNLPTokenizer"
|
2889 |
},
|
|
|
2902 |
],
|
2903 |
"tokenizers": {
|
2904 |
"Deva": {
|
2905 |
+
"full_object": "IndicNLPTokenizer(\"ne\")",
|
2906 |
+
"original_lang_name": "nepali",
|
2907 |
+
"original_lang_code": "nep",
|
2908 |
"script": "Deva",
|
2909 |
"class_name": "IndicNLPTokenizer"
|
2910 |
},
|
|
|
3958 |
"children": [],
|
3959 |
"tokenizers": {
|
3960 |
"Deva": {
|
3961 |
+
"full_object": "IndicNLPTokenizer(\"ne\")",
|
3962 |
+
"original_lang_name": "nepali",
|
3963 |
+
"original_lang_code": "nep",
|
3964 |
"script": "Deva",
|
3965 |
"class_name": "IndicNLPTokenizer"
|
3966 |
}
|
|
|
3984 |
],
|
3985 |
"tokenizers": {
|
3986 |
"Deva": {
|
3987 |
+
"full_object": "IndicNLPTokenizer(\"ne\")",
|
3988 |
+
"original_lang_name": "nepali",
|
3989 |
+
"original_lang_code": "nep",
|
3990 |
"script": "Deva",
|
3991 |
"class_name": "IndicNLPTokenizer"
|
3992 |
}
|
|
|
3998 |
],
|
3999 |
"tokenizers": {
|
4000 |
"Deva": {
|
4001 |
+
"full_object": "IndicNLPTokenizer(\"ne\")",
|
4002 |
+
"original_lang_name": "nepali",
|
4003 |
+
"original_lang_code": "nep",
|
4004 |
"script": "Deva",
|
4005 |
"class_name": "IndicNLPTokenizer"
|
4006 |
}
|
|
|
4312 |
"class_name": "IndicNLPTokenizer"
|
4313 |
},
|
4314 |
"Deva": {
|
4315 |
+
"full_object": "IndicNLPTokenizer(\"ne\")",
|
4316 |
+
"original_lang_name": "nepali",
|
4317 |
+
"original_lang_code": "nep",
|
4318 |
"script": "Deva",
|
4319 |
"class_name": "IndicNLPTokenizer"
|
4320 |
},
|
|
|
4353 |
}
|
4354 |
],
|
4355 |
"tokenizers": {
|
4356 |
+
"Deva": {
|
4357 |
+
"full_object": "IndicNLPTokenizer(\"ne\")",
|
4358 |
+
"original_lang_name": "nepali",
|
4359 |
+
"original_lang_code": "nep",
|
4360 |
+
"script": "Deva",
|
4361 |
+
"class_name": "IndicNLPTokenizer"
|
4362 |
+
},
|
4363 |
"Gujr": {
|
4364 |
"full_object": "IndicNLPTokenizer(\"gu\")",
|
4365 |
"original_lang_name": "gujarati",
|
|
|
4374 |
"script": "Guru",
|
4375 |
"class_name": "IndicNLPTokenizer"
|
4376 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4377 |
"Arab": {
|
4378 |
"full_object": "IndicNLPTokenizer(\"ur\")",
|
4379 |
"original_lang_name": "urdu",
|
|
|
5007 |
"iso_3_code": "ory",
|
5008 |
"children": [],
|
5009 |
"tokenizers": {
|
5010 |
+
"Orya": {
|
5011 |
+
"full_object": "IndicNLPTokenizer(\"or\")",
|
5012 |
+
"original_lang_name": "oriya",
|
5013 |
+
"original_lang_code": "ori",
|
5014 |
+
"script": "Orya",
|
5015 |
+
"class_name": "IndicNLPTokenizer"
|
5016 |
+
},
|
5017 |
"Latn": {
|
5018 |
"full_object": "StanzaTokenizer(\"kmr\")",
|
5019 |
"original_lang_name": "northern_kurdish",
|
|
|
5023 |
}
|
5024 |
},
|
5025 |
"node_i": "4234",
|
5026 |
+
"native_tokenizers": [
|
5027 |
+
"Orya"
|
5028 |
+
],
|
5029 |
"scripts": [
|
5030 |
"Latn",
|
5031 |
"Orya"
|
|
|
5053 |
}
|
5054 |
],
|
5055 |
"tokenizers": {
|
5056 |
+
"Orya": {
|
5057 |
+
"full_object": "IndicNLPTokenizer(\"or\")",
|
5058 |
+
"original_lang_name": "oriya",
|
5059 |
+
"original_lang_code": "ori",
|
5060 |
+
"script": "Orya",
|
5061 |
+
"class_name": "IndicNLPTokenizer"
|
5062 |
+
},
|
5063 |
"Latn": {
|
5064 |
"full_object": "StanzaTokenizer(\"kmr\")",
|
5065 |
"original_lang_name": "northern_kurdish",
|
|
|
5147 |
"script": "Deva",
|
5148 |
"class_name": "IndicNLPTokenizer"
|
5149 |
},
|
5150 |
+
"Orya": {
|
5151 |
+
"full_object": "IndicNLPTokenizer(\"or\")",
|
5152 |
+
"original_lang_name": "oriya",
|
5153 |
+
"original_lang_code": "ori",
|
5154 |
+
"script": "Orya",
|
5155 |
+
"class_name": "IndicNLPTokenizer"
|
5156 |
+
},
|
5157 |
"Latn": {
|
5158 |
"full_object": "StanzaTokenizer(\"kmr\")",
|
5159 |
"original_lang_name": "northern_kurdish",
|
|
|
5796 |
"children": [],
|
5797 |
"tokenizers": {
|
5798 |
"Deva": {
|
5799 |
+
"full_object": "IndicNLPTokenizer(\"kK\")",
|
5800 |
+
"original_lang_name": "konkani",
|
5801 |
+
"original_lang_code": "kok",
|
5802 |
"script": "Deva",
|
5803 |
"class_name": "IndicNLPTokenizer"
|
5804 |
},
|
|
|
5811 |
}
|
5812 |
},
|
5813 |
"node_i": "4287",
|
5814 |
+
"native_tokenizers": [
|
5815 |
+
"Deva"
|
5816 |
+
],
|
5817 |
"scripts": [
|
5818 |
"Latn",
|
5819 |
"Deva"
|
|
|
5826 |
"children": [],
|
5827 |
"tokenizers": {
|
5828 |
"Deva": {
|
5829 |
+
"full_object": "IndicNLPTokenizer(\"kK\")",
|
5830 |
+
"original_lang_name": "konkani",
|
5831 |
+
"original_lang_code": "kok",
|
5832 |
"script": "Deva",
|
5833 |
"class_name": "IndicNLPTokenizer"
|
5834 |
}
|
|
|
5892 |
],
|
5893 |
"tokenizers": {
|
5894 |
"Deva": {
|
5895 |
+
"full_object": "IndicNLPTokenizer(\"kK\")",
|
5896 |
+
"original_lang_name": "konkani",
|
5897 |
+
"original_lang_code": "kok",
|
5898 |
"script": "Deva",
|
5899 |
"class_name": "IndicNLPTokenizer"
|
5900 |
},
|
|
|
6068 |
"script": "Deva",
|
6069 |
"class_name": "IndicNLPTokenizer"
|
6070 |
},
|
6071 |
+
"Orya": {
|
6072 |
+
"full_object": "IndicNLPTokenizer(\"or\")",
|
6073 |
+
"original_lang_name": "oriya",
|
6074 |
+
"original_lang_code": "ori",
|
6075 |
+
"script": "Orya",
|
6076 |
+
"class_name": "IndicNLPTokenizer"
|
6077 |
+
},
|
6078 |
"Arab": {
|
6079 |
"full_object": "IndicNLPTokenizer(\"ur\")",
|
6080 |
"original_lang_name": "sindhi",
|
|
|
6619 |
"script": "Beng",
|
6620 |
"class_name": "IndicNLPTokenizer"
|
6621 |
},
|
6622 |
+
"Orya": {
|
6623 |
+
"full_object": "IndicNLPTokenizer(\"or\")",
|
6624 |
+
"original_lang_name": "oriya",
|
6625 |
+
"original_lang_code": "ori",
|
6626 |
+
"script": "Orya",
|
6627 |
+
"class_name": "IndicNLPTokenizer"
|
6628 |
+
},
|
6629 |
"Arab": {
|
6630 |
"full_object": "IndicNLPTokenizer(\"ur\")",
|
6631 |
"original_lang_name": "urdu",
|
|
|
8271 |
"script": "Beng",
|
8272 |
"class_name": "IndicNLPTokenizer"
|
8273 |
},
|
8274 |
+
"Orya": {
|
8275 |
+
"full_object": "IndicNLPTokenizer(\"or\")",
|
8276 |
+
"original_lang_name": "oriya",
|
8277 |
+
"original_lang_code": "ori",
|
8278 |
+
"script": "Orya",
|
8279 |
+
"class_name": "IndicNLPTokenizer"
|
8280 |
+
},
|
8281 |
"Arab": {
|
8282 |
"full_object": "IndicNLPTokenizer(\"ur\")",
|
8283 |
"original_lang_name": "urdu",
|
|
|
9631 |
}
|
9632 |
],
|
9633 |
"tokenizers": {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9634 |
"Latn": {
|
9635 |
"full_object": "SpaCyTokenizer(\"en\")",
|
9636 |
"original_lang_name": "english",
|
|
|
9638 |
"script": "Latn",
|
9639 |
"class_name": "SpaCyTokenizer"
|
9640 |
},
|
9641 |
+
"Armn": {
|
9642 |
+
"full_object": "SpaCyTokenizer(\"hy\")",
|
9643 |
+
"original_lang_name": "armenian",
|
9644 |
+
"original_lang_code": "hye",
|
9645 |
+
"script": "Armn",
|
9646 |
+
"class_name": "SpaCyTokenizer"
|
9647 |
+
},
|
9648 |
"Cyrl": {
|
9649 |
"full_object": "SpaCyTokenizer(\"ru\")",
|
9650 |
"original_lang_name": "russian",
|
|
|
9687 |
"script": "Beng",
|
9688 |
"class_name": "IndicNLPTokenizer"
|
9689 |
},
|
9690 |
+
"Orya": {
|
9691 |
+
"full_object": "IndicNLPTokenizer(\"or\")",
|
9692 |
+
"original_lang_name": "oriya",
|
9693 |
+
"original_lang_code": "ori",
|
9694 |
+
"script": "Orya",
|
9695 |
+
"class_name": "IndicNLPTokenizer"
|
9696 |
+
},
|
9697 |
"Arab": {
|
9698 |
"full_object": "IndicNLPTokenizer(\"ur\")",
|
9699 |
"original_lang_name": "urdu",
|
data/Sino-Tibetan.json
CHANGED
@@ -35,15 +35,17 @@
|
|
35 |
"children": [],
|
36 |
"tokenizers": {
|
37 |
"Hani": {
|
38 |
-
"full_object": "
|
39 |
-
"original_lang_name": "
|
40 |
-
"original_lang_code": "
|
41 |
"script": "Hani",
|
42 |
-
"class_name": "
|
43 |
}
|
44 |
},
|
45 |
"node_i": "8922",
|
46 |
-
"native_tokenizers": [
|
|
|
|
|
47 |
"scripts": [
|
48 |
"Hani"
|
49 |
]
|
@@ -127,15 +129,17 @@
|
|
127 |
"children": [],
|
128 |
"tokenizers": {
|
129 |
"Hani": {
|
130 |
-
"full_object": "
|
131 |
-
"original_lang_name": "
|
132 |
-
"original_lang_code": "
|
133 |
"script": "Hani",
|
134 |
-
"class_name": "
|
135 |
}
|
136 |
},
|
137 |
"node_i": "8930",
|
138 |
-
"native_tokenizers": [
|
|
|
|
|
139 |
"scripts": [
|
140 |
"Latn",
|
141 |
"Hani"
|
@@ -200,15 +204,17 @@
|
|
200 |
"children": [],
|
201 |
"tokenizers": {
|
202 |
"Hani": {
|
203 |
-
"full_object": "
|
204 |
-
"original_lang_name": "
|
205 |
-
"original_lang_code": "
|
206 |
"script": "Hani",
|
207 |
-
"class_name": "
|
208 |
}
|
209 |
},
|
210 |
"node_i": "8935",
|
211 |
-
"native_tokenizers": [
|
|
|
|
|
212 |
"scripts": [
|
213 |
"Hani"
|
214 |
]
|
@@ -220,15 +226,17 @@
|
|
220 |
"children": [],
|
221 |
"tokenizers": {
|
222 |
"Hani": {
|
223 |
-
"full_object": "
|
224 |
-
"original_lang_name": "
|
225 |
-
"original_lang_code": "
|
226 |
"script": "Hani",
|
227 |
-
"class_name": "
|
228 |
}
|
229 |
},
|
230 |
"node_i": "8936",
|
231 |
-
"native_tokenizers": [
|
|
|
|
|
232 |
"scripts": [
|
233 |
"Hani"
|
234 |
]
|
@@ -236,11 +244,11 @@
|
|
236 |
],
|
237 |
"tokenizers": {
|
238 |
"Hani": {
|
239 |
-
"full_object": "
|
240 |
-
"original_lang_name": "
|
241 |
-
"original_lang_code": "
|
242 |
"script": "Hani",
|
243 |
-
"class_name": "
|
244 |
}
|
245 |
},
|
246 |
"node_i": "8919",
|
@@ -6211,11 +6219,11 @@
|
|
6211 |
],
|
6212 |
"tokenizers": {
|
6213 |
"Hani": {
|
6214 |
-
"full_object": "
|
6215 |
-
"original_lang_name": "
|
6216 |
-
"original_lang_code": "
|
6217 |
"script": "Hani",
|
6218 |
-
"class_name": "
|
6219 |
},
|
6220 |
"Deva": {
|
6221 |
"full_object": "IndicNLPTokenizer(\"hi\")",
|
|
|
35 |
"children": [],
|
36 |
"tokenizers": {
|
37 |
"Hani": {
|
38 |
+
"full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
|
39 |
+
"original_lang_name": "chinese",
|
40 |
+
"original_lang_code": "zho",
|
41 |
"script": "Hani",
|
42 |
+
"class_name": "SpaCyTokenizer"
|
43 |
}
|
44 |
},
|
45 |
"node_i": "8922",
|
46 |
+
"native_tokenizers": [
|
47 |
+
"Hani"
|
48 |
+
],
|
49 |
"scripts": [
|
50 |
"Hani"
|
51 |
]
|
|
|
129 |
"children": [],
|
130 |
"tokenizers": {
|
131 |
"Hani": {
|
132 |
+
"full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
|
133 |
+
"original_lang_name": "chinese",
|
134 |
+
"original_lang_code": "zho",
|
135 |
"script": "Hani",
|
136 |
+
"class_name": "SpaCyTokenizer"
|
137 |
}
|
138 |
},
|
139 |
"node_i": "8930",
|
140 |
+
"native_tokenizers": [
|
141 |
+
"Hani"
|
142 |
+
],
|
143 |
"scripts": [
|
144 |
"Latn",
|
145 |
"Hani"
|
|
|
204 |
"children": [],
|
205 |
"tokenizers": {
|
206 |
"Hani": {
|
207 |
+
"full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
|
208 |
+
"original_lang_name": "chinese",
|
209 |
+
"original_lang_code": "zho",
|
210 |
"script": "Hani",
|
211 |
+
"class_name": "SpaCyTokenizer"
|
212 |
}
|
213 |
},
|
214 |
"node_i": "8935",
|
215 |
+
"native_tokenizers": [
|
216 |
+
"Hani"
|
217 |
+
],
|
218 |
"scripts": [
|
219 |
"Hani"
|
220 |
]
|
|
|
226 |
"children": [],
|
227 |
"tokenizers": {
|
228 |
"Hani": {
|
229 |
+
"full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
|
230 |
+
"original_lang_name": "chinese",
|
231 |
+
"original_lang_code": "zho",
|
232 |
"script": "Hani",
|
233 |
+
"class_name": "SpaCyTokenizer"
|
234 |
}
|
235 |
},
|
236 |
"node_i": "8936",
|
237 |
+
"native_tokenizers": [
|
238 |
+
"Hani"
|
239 |
+
],
|
240 |
"scripts": [
|
241 |
"Hani"
|
242 |
]
|
|
|
244 |
],
|
245 |
"tokenizers": {
|
246 |
"Hani": {
|
247 |
+
"full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
|
248 |
+
"original_lang_name": "chinese",
|
249 |
+
"original_lang_code": "zho",
|
250 |
"script": "Hani",
|
251 |
+
"class_name": "SpaCyTokenizer"
|
252 |
}
|
253 |
},
|
254 |
"node_i": "8919",
|
|
|
6219 |
],
|
6220 |
"tokenizers": {
|
6221 |
"Hani": {
|
6222 |
+
"full_object": "SpaCyTokenizer(\"zh\", {\"nlp\": {\"tokenizer\": {\"segmenter\": \"jieba\"}}})",
|
6223 |
+
"original_lang_name": "chinese",
|
6224 |
+
"original_lang_code": "zho",
|
6225 |
"script": "Hani",
|
6226 |
+
"class_name": "SpaCyTokenizer"
|
6227 |
},
|
6228 |
"Deva": {
|
6229 |
"full_object": "IndicNLPTokenizer(\"hi\")",
|
data/Turkic.json
CHANGED
@@ -455,9 +455,9 @@
|
|
455 |
"children": [],
|
456 |
"tokenizers": {
|
457 |
"Latn": {
|
458 |
-
"full_object": "SpaCyTokenizer(\"
|
459 |
-
"original_lang_name": "
|
460 |
-
"original_lang_code": "
|
461 |
"script": "Latn",
|
462 |
"class_name": "SpaCyTokenizer"
|
463 |
},
|
@@ -470,7 +470,9 @@
|
|
470 |
}
|
471 |
},
|
472 |
"node_i": "10584",
|
473 |
-
"native_tokenizers": [
|
|
|
|
|
474 |
"scripts": [
|
475 |
"Latn",
|
476 |
"Cyrl"
|
@@ -479,9 +481,9 @@
|
|
479 |
],
|
480 |
"tokenizers": {
|
481 |
"Latn": {
|
482 |
-
"full_object": "SpaCyTokenizer(\"
|
483 |
-
"original_lang_name": "
|
484 |
-
"original_lang_code": "
|
485 |
"script": "Latn",
|
486 |
"class_name": "SpaCyTokenizer"
|
487 |
},
|
|
|
455 |
"children": [],
|
456 |
"tokenizers": {
|
457 |
"Latn": {
|
458 |
+
"full_object": "SpaCyTokenizer(\"az\")",
|
459 |
+
"original_lang_name": "azerbaijani",
|
460 |
+
"original_lang_code": "aze",
|
461 |
"script": "Latn",
|
462 |
"class_name": "SpaCyTokenizer"
|
463 |
},
|
|
|
470 |
}
|
471 |
},
|
472 |
"node_i": "10584",
|
473 |
+
"native_tokenizers": [
|
474 |
+
"Latn"
|
475 |
+
],
|
476 |
"scripts": [
|
477 |
"Latn",
|
478 |
"Cyrl"
|
|
|
481 |
],
|
482 |
"tokenizers": {
|
483 |
"Latn": {
|
484 |
+
"full_object": "SpaCyTokenizer(\"az\")",
|
485 |
+
"original_lang_name": "azerbaijani",
|
486 |
+
"original_lang_code": "aze",
|
487 |
"script": "Latn",
|
488 |
"class_name": "SpaCyTokenizer"
|
489 |
},
|
data/Uralic.json
CHANGED
@@ -77,15 +77,17 @@
|
|
77 |
"children": [],
|
78 |
"tokenizers": {
|
79 |
"Latn": {
|
80 |
-
"full_object": "SpaCyTokenizer(\"
|
81 |
-
"original_lang_name": "
|
82 |
-
"original_lang_code": "
|
83 |
"script": "Latn",
|
84 |
"class_name": "SpaCyTokenizer"
|
85 |
}
|
86 |
},
|
87 |
"node_i": "10680",
|
88 |
-
"native_tokenizers": [
|
|
|
|
|
89 |
"scripts": [
|
90 |
"Latn"
|
91 |
]
|
@@ -289,15 +291,17 @@
|
|
289 |
"children": [],
|
290 |
"tokenizers": {
|
291 |
"Latn": {
|
292 |
-
"full_object": "SpaCyTokenizer(\"
|
293 |
-
"original_lang_name": "
|
294 |
-
"original_lang_code": "
|
295 |
"script": "Latn",
|
296 |
"class_name": "SpaCyTokenizer"
|
297 |
}
|
298 |
},
|
299 |
"node_i": "10691",
|
300 |
-
"native_tokenizers": [
|
|
|
|
|
301 |
"scripts": [
|
302 |
"Latn"
|
303 |
]
|
|
|
77 |
"children": [],
|
78 |
"tokenizers": {
|
79 |
"Latn": {
|
80 |
+
"full_object": "SpaCyTokenizer(\"et\")",
|
81 |
+
"original_lang_name": "estonian",
|
82 |
+
"original_lang_code": "est",
|
83 |
"script": "Latn",
|
84 |
"class_name": "SpaCyTokenizer"
|
85 |
}
|
86 |
},
|
87 |
"node_i": "10680",
|
88 |
+
"native_tokenizers": [
|
89 |
+
"Latn"
|
90 |
+
],
|
91 |
"scripts": [
|
92 |
"Latn"
|
93 |
]
|
|
|
291 |
"children": [],
|
292 |
"tokenizers": {
|
293 |
"Latn": {
|
294 |
+
"full_object": "SpaCyTokenizer(\"et\")",
|
295 |
+
"original_lang_name": "estonian",
|
296 |
+
"original_lang_code": "est",
|
297 |
"script": "Latn",
|
298 |
"class_name": "SpaCyTokenizer"
|
299 |
}
|
300 |
},
|
301 |
"node_i": "10691",
|
302 |
+
"native_tokenizers": [
|
303 |
+
"Latn"
|
304 |
+
],
|
305 |
"scripts": [
|
306 |
"Latn"
|
307 |
]
|