KoichiYasuoka committed on
Commit
e046a02
·
1 Parent(s): 47b0b91

initial release

Browse files
README.md CHANGED
@@ -1,3 +1,43 @@
1
  ---
2
- license: cc-by-sa-4.0
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ language:
3
+ - "sr"
4
+ tags:
5
+ - "serbian"
6
+ - "token-classification"
7
+ - "pos"
8
+ - "dependency-parsing"
9
+ datasets:
10
+ - "universal_dependencies"
11
+ license: "cc-by-sa-4.0"
12
+ pipeline_tag: "token-classification"
13
+ widget:
14
+ - text: "Да има сира и масла и моја би мати знала гибати гибаницу."
15
+ - text: "Da ima sira i masla i moja bi mati znala gibati gibanicu."
16
  ---
17
+
18
+ # roberta-base-serbian-upos
19
+
20
+ ## Model Description
21
+
22
+ This is a RoBERTa model for Serbian text (Cyrillic and Latin scripts) for POS-tagging and dependency-parsing, derived from [roberta-base-serbian](https://huggingface.co/KoichiYasuoka/roberta-base-serbian). Every word is tagged with its [UPOS](https://universaldependencies.org/u/pos/) (Universal Part-Of-Speech) label.
23
+
24
+ ## How to Use
25
+
26
+ ```py
27
+ import torch
28
+ from transformers import AutoTokenizer,AutoModelForTokenClassification
29
+ tokenizer=AutoTokenizer.from_pretrained("KoichiYasuoka/roberta-base-serbian-upos")
30
+ model=AutoModelForTokenClassification.from_pretrained("KoichiYasuoka/roberta-base-serbian-upos")
31
+ ```
32
+
33
+ or
34
+
35
+ ```py
36
+ import esupar
37
+ nlp=esupar.load("KoichiYasuoka/roberta-base-serbian-upos")
38
+ ```
39
+
40
+ ## See Also
41
+
42
+ [esupar](https://github.com/KoichiYasuoka/esupar): Tokenizer, POS-tagger, and dependency-parser with BERT/RoBERTa models
43
+
config.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "RobertaForTokenClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "classifier_dropout": null,
8
+ "eos_token_id": 2,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "id2label": {
13
+ "0": "ADJ",
14
+ "1": "ADP",
15
+ "2": "ADV",
16
+ "3": "AUX",
17
+ "4": "B-ADJ",
18
+ "5": "B-ADP",
19
+ "6": "B-ADV",
20
+ "7": "B-AUX",
21
+ "8": "B-DET",
22
+ "9": "B-INTJ",
23
+ "10": "B-NOUN",
24
+ "11": "B-NUM",
25
+ "12": "B-PART",
26
+ "13": "B-PRON",
27
+ "14": "B-PROPN",
28
+ "15": "B-PUNCT",
29
+ "16": "B-VERB",
30
+ "17": "B-X",
31
+ "18": "CCONJ",
32
+ "19": "DET",
33
+ "20": "I-ADJ",
34
+ "21": "I-ADP",
35
+ "22": "I-ADV",
36
+ "23": "I-AUX",
37
+ "24": "I-DET",
38
+ "25": "I-INTJ",
39
+ "26": "I-NOUN",
40
+ "27": "I-NUM",
41
+ "28": "I-PART",
42
+ "29": "I-PRON",
43
+ "30": "I-PROPN",
44
+ "31": "I-PUNCT",
45
+ "32": "I-VERB",
46
+ "33": "I-X",
47
+ "34": "INTJ",
48
+ "35": "NOUN",
49
+ "36": "NUM",
50
+ "37": "PART",
51
+ "38": "PRON",
52
+ "39": "PROPN",
53
+ "40": "PUNCT",
54
+ "41": "SCONJ",
55
+ "42": "SYM",
56
+ "43": "VERB",
57
+ "44": "X"
58
+ },
59
+ "initializer_range": 0.02,
60
+ "intermediate_size": 3072,
61
+ "label2id": {
62
+ "ADJ": 0,
63
+ "ADP": 1,
64
+ "ADV": 2,
65
+ "AUX": 3,
66
+ "B-ADJ": 4,
67
+ "B-ADP": 5,
68
+ "B-ADV": 6,
69
+ "B-AUX": 7,
70
+ "B-DET": 8,
71
+ "B-INTJ": 9,
72
+ "B-NOUN": 10,
73
+ "B-NUM": 11,
74
+ "B-PART": 12,
75
+ "B-PRON": 13,
76
+ "B-PROPN": 14,
77
+ "B-PUNCT": 15,
78
+ "B-VERB": 16,
79
+ "B-X": 17,
80
+ "CCONJ": 18,
81
+ "DET": 19,
82
+ "I-ADJ": 20,
83
+ "I-ADP": 21,
84
+ "I-ADV": 22,
85
+ "I-AUX": 23,
86
+ "I-DET": 24,
87
+ "I-INTJ": 25,
88
+ "I-NOUN": 26,
89
+ "I-NUM": 27,
90
+ "I-PART": 28,
91
+ "I-PRON": 29,
92
+ "I-PROPN": 30,
93
+ "I-PUNCT": 31,
94
+ "I-VERB": 32,
95
+ "I-X": 33,
96
+ "INTJ": 34,
97
+ "NOUN": 35,
98
+ "NUM": 36,
99
+ "PART": 37,
100
+ "PRON": 38,
101
+ "PROPN": 39,
102
+ "PUNCT": 40,
103
+ "SCONJ": 41,
104
+ "SYM": 42,
105
+ "VERB": 43,
106
+ "X": 44
107
+ },
108
+ "layer_norm_eps": 1e-12,
109
+ "max_position_embeddings": 512,
110
+ "model_type": "roberta",
111
+ "num_attention_heads": 12,
112
+ "num_hidden_layers": 12,
113
+ "pad_token_id": 1,
114
+ "position_embedding_type": "absolute",
115
+ "tokenizer_class": "BertTokenizerFast",
116
+ "torch_dtype": "float32",
117
+ "transformers_version": "4.18.0",
118
+ "type_vocab_size": 2,
119
+ "use_cache": true,
120
+ "vocab_size": 14438
121
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6dd000b5c285446ada737f34e597cacd32c76e88635d618a112ad144ff9aa46c
3
+ size 386365937
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
supar.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb2c0bdcda63d436c2524669770e88100f33b8ad3b00acfc85cb0c60547c0441
3
+ size 440059877
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"do_lower_case": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": false, "do_lowercase": true, "never_split": ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"], "model_max_length": 512, "do_basic_tokenize": true, "tokenizer_class": "BertTokenizerFast"}
vocab.txt ADDED
The diff for this file is too large to render. See raw diff