Files changed (4)
  1. README.md +0 -57
  2. config.json +0 -119
  3. pytorch_model.bin +0 -3
  4. vocab.txt +0 -0
README.md DELETED
@@ -1,57 +0,0 @@
- ---
- language: id
- tags:
- - indobert
- - indobenchmark
- ---
-
- ## How to use
-
- ### Load model and tokenizer
- ```python
- from transformers import AutoTokenizer, AutoModelForTokenClassification
-
- tokenizer = AutoTokenizer.from_pretrained("ageng-anugrah/indobert-large-p2-finetuned-pos")
- model = AutoModelForTokenClassification.from_pretrained("ageng-anugrah/indobert-large-p2-finetuned-pos")
- ```
-
- ### Extract POS tags
- ```python
- import torch
-
- def predict(model, tokenizer, sentence):
-     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-     inputs = tokenizer(sentence.split(),
-                        is_split_into_words=True,
-                        return_offsets_mapping=True,
-                        return_tensors="pt")
-
-     # move model and inputs to the GPU if one is available
-     model.to(device)
-     ids = inputs["input_ids"].to(device)
-     mask = inputs["attention_mask"].to(device)
-
-     # forward pass
-     outputs = model(ids, attention_mask=mask)
-     logits = outputs[0]
-
-     active_logits = logits.view(-1, model.num_labels)  # shape (batch_size * seq_len, num_labels)
-     flattened_predictions = torch.argmax(active_logits, dim=1)  # shape (batch_size * seq_len,), token-level predictions
-
-     tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
-     token_predictions = [model.config.id2label[i] for i in flattened_predictions.cpu().numpy()]
-     wp_preds = list(zip(tokens, token_predictions))  # list of (wordpiece, prediction) tuples
-
-     # only the prediction on the first word piece of each word is kept
-     prediction = []
-     for token_pred, mapping in zip(wp_preds, inputs["offset_mapping"].squeeze().tolist()):
-         if mapping[0] == 0 and mapping[1] != 0:
-             prediction.append(token_pred[1])
-
-     return sentence.split(), prediction
-
- sentence = "BJ Habibie adalah Presiden Indonesia ke-3"
- words, labels = predict(model, tokenizer, sentence)
- ```
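For context on what the deleted snippet returns, here is a minimal usage sketch, assuming `model`, `tokenizer`, and the `predict` function defined in the README above: it pairs each input word with its predicted tag.

```python
# Minimal usage sketch (assumes `model`, `tokenizer`, and `predict` from the deleted README above).
sentence = "BJ Habibie adalah Presiden Indonesia ke-3"
words, labels = predict(model, tokenizer, sentence)

# Print each word next to its predicted POS tag.
for word, label in zip(words, labels):
    print(f"{word}\t{label}")
```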
config.json DELETED
@@ -1,119 +0,0 @@
- {
-   "_name_or_path": "indobenchmark/indobert-large-p2",
-   "_num_labels": 5,
-   "architectures": [
-     "BertForTokenClassification"
-   ],
-   "attention_probs_dropout_prob": 0.1,
-   "classifier_dropout": null,
-   "directionality": "bidi",
-   "hidden_act": "gelu",
-   "hidden_dropout_prob": 0.1,
-   "hidden_size": 1024,
-   "id2label": {
-     "0": "B-CC",
-     "1": "B-CD",
-     "2": "B-DT",
-     "3": "B-FW",
-     "4": "B-IN",
-     "5": "B-JJ",
-     "6": "B-MD",
-     "7": "B-NEG",
-     "8": "B-NN",
-     "9": "B-NND",
-     "10": "B-NNP",
-     "11": "B-OD",
-     "12": "B-PR",
-     "13": "B-PRP",
-     "14": "B-RB",
-     "15": "B-RP",
-     "16": "B-SC",
-     "17": "B-SYM",
-     "18": "B-UH",
-     "19": "B-VB",
-     "20": "B-WH",
-     "21": "B-X",
-     "22": "B-Z",
-     "23": "I-CC",
-     "24": "I-CD",
-     "25": "I-IN",
-     "26": "I-JJ",
-     "27": "I-NN",
-     "28": "I-NND",
-     "29": "I-NNP",
-     "30": "I-OD",
-     "31": "I-PR",
-     "32": "I-PRP",
-     "33": "I-RB",
-     "34": "I-SC",
-     "35": "I-SYM",
-     "36": "I-UH",
-     "37": "I-VB",
-     "38": "I-WH",
-     "39": "I-X",
-     "40": "I-Z"
-   },
-   "initializer_range": 0.02,
-   "intermediate_size": 4096,
-   "label2id": {
-     "B-CC": 0,
-     "B-CD": 1,
-     "B-DT": 2,
-     "B-FW": 3,
-     "B-IN": 4,
-     "B-JJ": 5,
-     "B-MD": 6,
-     "B-NEG": 7,
-     "B-NN": 8,
-     "B-NND": 9,
-     "B-NNP": 10,
-     "B-OD": 11,
-     "B-PR": 12,
-     "B-PRP": 13,
-     "B-RB": 14,
-     "B-RP": 15,
-     "B-SC": 16,
-     "B-SYM": 17,
-     "B-UH": 18,
-     "B-VB": 19,
-     "B-WH": 20,
-     "B-X": 21,
-     "B-Z": 22,
-     "I-CC": 23,
-     "I-CD": 24,
-     "I-IN": 25,
-     "I-JJ": 26,
-     "I-NN": 27,
-     "I-NND": 28,
-     "I-NNP": 29,
-     "I-OD": 30,
-     "I-PR": 31,
-     "I-PRP": 32,
-     "I-RB": 33,
-     "I-SC": 34,
-     "I-SYM": 35,
-     "I-UH": 36,
-     "I-VB": 37,
-     "I-WH": 38,
-     "I-X": 39,
-     "I-Z": 40
-   },
-   "layer_norm_eps": 1e-12,
-   "max_position_embeddings": 512,
-   "model_type": "bert",
-   "num_attention_heads": 16,
-   "num_hidden_layers": 24,
-   "output_past": true,
-   "pad_token_id": 0,
-   "pooler_fc_size": 768,
-   "pooler_num_attention_heads": 12,
-   "pooler_num_fc_layers": 3,
-   "pooler_size_per_head": 128,
-   "pooler_type": "first_token_transform",
-   "position_embedding_type": "absolute",
-   "torch_dtype": "float32",
-   "transformers_version": "4.27.4",
-   "type_vocab_size": 2,
-   "use_cache": true,
-   "vocab_size": 30522
- }
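The deleted config.json carries the full POS tag inventory in its `id2label` / `label2id` maps. As a minimal sketch, assuming the `ageng-anugrah/indobert-large-p2-finetuned-pos` repository remains accessible on the Hub, that mapping can be inspected through `transformers.AutoConfig` without downloading the weights:

```python
from transformers import AutoConfig

# Load only the configuration (no weights); assumes the repo is still available on the Hub.
config = AutoConfig.from_pretrained("ageng-anugrah/indobert-large-p2-finetuned-pos")

print(config.num_labels)         # 41, derived from the id2label map above
print(config.id2label[8])        # "B-NN"
print(config.label2id["B-NNP"])  # 10
```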
pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:e3e8dbdd5d8a3d6374caea594e779bd3b5c3b6e5e364396929aa510ca1abb63b
- size 1336675437
vocab.txt DELETED
The diff for this file is too large to render. See raw diff