---
language: id
tags:
- indobert
- indobenchmark
---

## How to use

### Load model and tokenizer
```python
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ageng-anugrah/indobert-large-p2-finetuned-pos")
model = AutoModelForTokenClassification.from_pretrained("ageng-anugrah/indobert-large-p2-finetuned-pos")
```

### Extract POS tags
```python
import torch

def predict(model, tokenizer, sentence):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = tokenizer(sentence.split(),
                       is_split_into_words=True,
                       return_offsets_mapping=True,
                       return_tensors="pt")

    # move model and inputs to the same device (GPU if available)
    model.to(device)
    ids = inputs["input_ids"].to(device)
    mask = inputs["attention_mask"].to(device)

    # forward pass
    outputs = model(ids, attention_mask=mask)
    logits = outputs[0]

    active_logits = logits.view(-1, model.num_labels)  # shape (batch_size * seq_len, num_labels)
    flattened_predictions = torch.argmax(active_logits, axis=1)  # shape (batch_size * seq_len,) - token-level predictions

    tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
    token_predictions = [model.config.id2label[i] for i in flattened_predictions.cpu().numpy()]
    wp_preds = list(zip(tokens, token_predictions))  # list of (wordpiece, prediction) tuples

    prediction = []
    for token_pred, mapping in zip(wp_preds, inputs["offset_mapping"].squeeze().tolist()):
        # only the prediction on the first word piece of each word is kept
        if mapping[0] == 0 and mapping[1] != 0:
            prediction.append(token_pred[1])

    return sentence.split(), prediction

sentence = "BJ Habibie adalah Presiden Indonesia ke-3"
words, labels = predict(model, tokenizer, sentence)
```
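
`predict` returns the whitespace-split words together with one predicted tag per word, so the two lists line up one-to-one. A minimal way to inspect the output:

```python
# print each word next to its predicted tag
for word, label in zip(words, labels):
    print(f"{word}\t{label}")
```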