ageng-anugrah
/

indobert-large-p2-finetuned-ner

+---
+language: id
+tags:
+- indobert
+- indobenchmark
+---
+## How to use
+### Load model and tokenizer
+```python
+from transformers import AutoTokenizer, AutoModel
+tokenizer = AutoTokenizer.from_pretrained("ageng-anugrah/indobert-large-p2-finetuned-ner")
+model = AutoModel.from_pretrained("ageng-anugrah/indobert-large-p2-finetuned-ner")
+```
+### Extract NER Tag
+```python
+import torch
+def predict(model, tokenizer, sentence):
+    # will be moved to config later
+    ids_to_labels = {
+      0: 'B-ORGANISATION',
+      1: 'B-PERSON',
+      2: 'B-PLACE',
+      3: 'I-ORGANISATION',
+      4: 'I-PERSON',
+      5: 'I-PLACE',
+      6: 'O',
+    }
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    inputs = tokenizer(sentence.split(),
+                    is_split_into_words = True,
+                    return_offsets_mapping=True,
+                    return_tensors="pt")
+    model.to(device)
+    # move to gpu
+    ids = inputs["input_ids"].to(device)
+    mask = inputs["attention_mask"].to(device)
+    # forward pass
+    outputs = model(ids, attention_mask=mask)
+    logits = outputs[0]
+    active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
+    flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level
+    tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
+    token_predictions = [ids_to_labels[i] for i in flattened_predictions.cpu().numpy()]
+    wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)
+    prediction = []
+    for token_pred, mapping in zip(wp_preds, inputs["offset_mapping"].squeeze().tolist()):
+        #only predictions on first word pieces are important
+        if mapping[0] == 0 and mapping[1] != 0:
+            prediction.append(token_pred[1])
+        else:
+            continue
+    return sentence.split(), prediction
+# Example: `words` is the whitespace-split sentence and `labels` is the list
+# of NER tags returned by `predict` for it.
+sentence = "BJ Habibie adalah Presiden Indonesia ke-3"
+words, labels = predict(model, tokenizer, sentence)
+```