master
#1
opened by ageng-anugrah
- README.md +0 -57
- config.json +0 -119
- pytorch_model.bin +0 -3
- vocab.txt +0 -0
README.md
DELETED
@@ -1,57 +0,0 @@
---
language: id
tags:
- indobert
- indobenchmark
---

## How to use

### Load model and tokenizer
```python
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ageng-anugrah/indobert-large-p2-finetuned-pos")
model = AutoModelForTokenClassification.from_pretrained("ageng-anugrah/indobert-large-p2-finetuned-pos")
```

### Extract POS tags
```python
import torch

def predict(model, tokenizer, sentence):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = tokenizer(sentence.split(),
                       is_split_into_words=True,
                       return_offsets_mapping=True,
                       return_tensors="pt")

    model.to(device)
    # move inputs to the same device as the model
    ids = inputs["input_ids"].to(device)
    mask = inputs["attention_mask"].to(device)

    # forward pass
    with torch.no_grad():
        outputs = model(ids, attention_mask=mask)
    logits = outputs[0]

    active_logits = logits.view(-1, model.num_labels)  # shape (batch_size * seq_len, num_labels)
    flattened_predictions = torch.argmax(active_logits, axis=1)  # shape (batch_size * seq_len,) - predictions at the token level

    tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
    token_predictions = [model.config.id2label[i] for i in flattened_predictions.cpu().numpy()]
    wp_preds = list(zip(tokens, token_predictions))  # list of tuples: (wordpiece, prediction)

    prediction = []
    for token_pred, mapping in zip(wp_preds, inputs["offset_mapping"].squeeze().tolist()):
        # only the prediction on the first word piece of each word is kept
        if mapping[0] == 0 and mapping[1] != 0:
            prediction.append(token_pred[1])

    return sentence.split(), prediction

sentence = "BJ Habibie adalah Presiden Indonesia ke-3"
words, labels = predict(model, tokenizer, sentence)
```
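The predict function above aligns word-piece predictions with words manually. A similar result can usually be obtained with the generic Transformers token-classification pipeline; the sketch below is illustrative and not part of the original card, with `aggregation_strategy="first"` chosen to mirror the first-word-piece logic.

```python
from transformers import pipeline

# Sketch: pipeline-based tagging with the same checkpoint.
# aggregation_strategy="first" keeps the prediction of the first
# word piece of each word, similar to the manual predict() above.
tagger = pipeline(
    "token-classification",
    model="ageng-anugrah/indobert-large-p2-finetuned-pos",
    aggregation_strategy="first",
)

for item in tagger("BJ Habibie adalah Presiden Indonesia ke-3"):
    print(item["word"], item["entity_group"])
```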
config.json
DELETED
@@ -1,119 +0,0 @@
{
  "_name_or_path": "indobenchmark/indobert-large-p2",
  "_num_labels": 5,
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "B-CC",
    "1": "B-CD",
    "2": "B-DT",
    "3": "B-FW",
    "4": "B-IN",
    "5": "B-JJ",
    "6": "B-MD",
    "7": "B-NEG",
    "8": "B-NN",
    "9": "B-NND",
    "10": "B-NNP",
    "11": "B-OD",
    "12": "B-PR",
    "13": "B-PRP",
    "14": "B-RB",
    "15": "B-RP",
    "16": "B-SC",
    "17": "B-SYM",
    "18": "B-UH",
    "19": "B-VB",
    "20": "B-WH",
    "21": "B-X",
    "22": "B-Z",
    "23": "I-CC",
    "24": "I-CD",
    "25": "I-IN",
    "26": "I-JJ",
    "27": "I-NN",
    "28": "I-NND",
    "29": "I-NNP",
    "30": "I-OD",
    "31": "I-PR",
    "32": "I-PRP",
    "33": "I-RB",
    "34": "I-SC",
    "35": "I-SYM",
    "36": "I-UH",
    "37": "I-VB",
    "38": "I-WH",
    "39": "I-X",
    "40": "I-Z"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "B-CC": 0,
    "B-CD": 1,
    "B-DT": 2,
    "B-FW": 3,
    "B-IN": 4,
    "B-JJ": 5,
    "B-MD": 6,
    "B-NEG": 7,
    "B-NN": 8,
    "B-NND": 9,
    "B-NNP": 10,
    "B-OD": 11,
    "B-PR": 12,
    "B-PRP": 13,
    "B-RB": 14,
    "B-RP": 15,
    "B-SC": 16,
    "B-SYM": 17,
    "B-UH": 18,
    "B-VB": 19,
    "B-WH": 20,
    "B-X": 21,
    "B-Z": 22,
    "I-CC": 23,
    "I-CD": 24,
    "I-IN": 25,
    "I-JJ": 26,
    "I-NN": 27,
    "I-NND": 28,
    "I-NNP": 29,
    "I-OD": 30,
    "I-PR": 31,
    "I-PRP": 32,
    "I-RB": 33,
    "I-SC": 34,
    "I-SYM": 35,
    "I-UH": 36,
    "I-VB": 37,
    "I-WH": 38,
    "I-X": 39,
    "I-Z": 40
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.27.4",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}
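The id2label and label2id maps in this config are what turn argmax indices into POS tags at inference time. A minimal sketch of inspecting them through the standard Transformers config API (the printed values follow directly from the JSON above):

```python
from transformers import AutoConfig

# Load only the configuration; id2label mirrors the mapping in config.json.
config = AutoConfig.from_pretrained("ageng-anugrah/indobert-large-p2-finetuned-pos")

print(config.num_labels)        # 41 tags in total
print(config.id2label[10])      # "B-NNP"
print(config.label2id["B-VB"])  # 19
```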
pytorch_model.bin
DELETED
@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e3e8dbdd5d8a3d6374caea594e779bd3b5c3b6e5e364396929aa510ca1abb63b
size 1336675437
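This is a Git LFS pointer rather than the weights themselves; the oid and size describe the actual pytorch_model.bin. A minimal sketch for checking a locally downloaded copy against the pointer (the local path is hypothetical):

```python
import hashlib
import os

# Hypothetical local path to the downloaded weights file.
path = "pytorch_model.bin"

# Compare size and SHA-256 against the LFS pointer above.
sha256 = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha256.update(chunk)

print(os.path.getsize(path) == 1336675437)
print(sha256.hexdigest() == "e3e8dbdd5d8a3d6374caea594e779bd3b5c3b6e5e364396929aa510ca1abb63b")
```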
vocab.txt
DELETED
The diff for this file is too large to render. See raw diff.