---
language: id
tags:
- indobert
- indobenchmark
---
## How to use
### Load model and tokenizer
```python
from transformers import AutoTokenizer, AutoModelForTokenClassification
tokenizer = AutoTokenizer.from_pretrained("ageng-anugrah/indobert-large-p2-finetuned-ner")
model = AutoModelForTokenClassification.from_pretrained("ageng-anugrah/indobert-large-p2-finetuned-ner")
```
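The tag set the model predicts can be inspected from the loaded config. This is a minimal sketch; the exact tag names depend on the checkpoint's `id2label` mapping.

```python
# Inspect the NER tag set shipped with the checkpoint
# (the exact labels depend on the model's config).
print(model.config.id2label)
```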
### Extract NER Tags
```python
import torch
def predict(model, tokenizer, sentence):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inputs = tokenizer(sentence.split(),
                       is_split_into_words=True,
                       return_offsets_mapping=True,
                       return_tensors="pt",
                       padding='max_length',
                       truncation=True,
                       max_length=512)

    model.to(device)

    # move tensors to the same device as the model
    ids = inputs["input_ids"].to(device)
    mask = inputs["attention_mask"].to(device)

    # forward pass
    outputs = model(ids, attention_mask=mask)
    logits = outputs[0]

    active_logits = logits.view(-1, model.num_labels)  # shape (batch_size * seq_len, num_labels)
    flattened_predictions = torch.argmax(active_logits, dim=1)  # shape (batch_size * seq_len,) - token-level predictions

    tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
    token_predictions = [model.config.id2label[i] for i in flattened_predictions.cpu().numpy()]
    wp_preds = list(zip(tokens, token_predictions))  # list of (wordpiece, prediction) tuples

    prediction = []
    for token_pred, mapping in zip(wp_preds, inputs["offset_mapping"].squeeze().tolist()):
        # only predictions on the first word piece of each word are kept
        if mapping[0] == 0 and mapping[1] != 0:
            prediction.append(token_pred[1])

    return sentence.split(), prediction

sentence = "BJ Habibie adalah Presiden Indonesia ke-3 yang lahir pada tanggl 25 Juni 1936"
words, labels = predict(model, tokenizer, sentence)
```
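To read the result, the returned words and tags can be paired up. This is a minimal usage sketch built on the `predict` call above; the printed tags will vary with the checkpoint's label set.

```python
# Pair each input word with its predicted NER tag
# (assumes `words` and `labels` from the predict() call above).
for word, label in zip(words, labels):
    print(f"{word}\t{label}")
```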