---
language: id
tags:
- indobert
- indobenchmark
---
|
|
|
## How to use |
|
|
|
### Load model and tokenizer |
|
```python |
|
from transformers import AutoTokenizer, AutoModelForTokenClassification |
|
|
|
# Single checkpoint identifier shared by the tokenizer and the
# token-classification model, so the two cannot drift apart.
checkpoint = "ageng-anugrah/indobert-large-p2-finetuned-ner"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForTokenClassification.from_pretrained(checkpoint)
|
``` |
|
|
|
### Extract NER Tag |
|
```python |
|
import torch |
|
def predict(model, tokenizer, sentence, max_length=512):
    """Predict one NER label for each whitespace-separated word of a sentence.

    The sentence is split on whitespace, tokenized as pre-split words, and
    run through ``model`` once. Only the prediction made on the first word
    piece of every word is kept, so the labels line up with the words.

    Args:
        model: Token-classification model. It is called as
            ``model(input_ids, attention_mask=...)``, its first output is
            used as the logits, and ``model.config.id2label`` maps label ids
            to label strings. The model is moved to the selected device and
            switched to evaluation mode as a side effect.
        tokenizer: Tokenizer matching ``model``. It must support
            ``return_offsets_mapping=True`` (in ``transformers`` this is
            only available on fast tokenizers).
        sentence: Raw input text.
        max_length: Maximum number of tokens (special tokens included) fed
            to the model; longer inputs are truncated.

    Returns:
        A ``(words, labels)`` tuple of two equally long lists. ``words`` are
        the whitespace-split words that received a prediction (all of them
        unless the input was truncated) and ``labels`` are their predicted
        label strings. An empty or whitespace-only sentence yields
        ``([], [])`` without running the model.
    """
    words = sentence.split()
    if not words:
        # Nothing to tag: skip the device transfer and the forward pass.
        return [], []

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # No padding: a single sequence does not need it, and padding up to
    # max_length would only make the forward pass slower.
    inputs = tokenizer(
        words,
        is_split_into_words=True,
        return_offsets_mapping=True,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
    )

    model.to(device)
    model.eval()  # make sure dropout is disabled for inference
    ids = inputs["input_ids"].to(device)
    mask = inputs["attention_mask"].to(device)

    # Inference only: do not build the autograd graph.
    with torch.no_grad():
        logits = model(ids, attention_mask=mask)[0]

    # Batch size is 1, so index 0 selects the per-token rows directly.
    label_ids = torch.argmax(logits[0], dim=-1).tolist()
    offsets = inputs["offset_mapping"][0].tolist()

    # With pre-split words, offsets are relative to each word: the first
    # word piece starts at 0, while special tokens have the offset (0, 0).
    prediction = [
        model.config.id2label[label_id]
        for label_id, (start, end) in zip(label_ids, offsets)
        if start == 0 and end != 0
    ]

    # If the input was truncated, drop the words that got no prediction so
    # both returned lists stay aligned.
    return words[:len(prediction)], prediction
|
|
|
# Example: tag an Indonesian sentence. predict() returns the
# whitespace-split words together with the predicted NER labels.
sentence = "BJ Habibie adalah Presiden Indonesia ke-3 yang lahir pada tanggl 25 Juni 1936"
words, labels = predict(model=model, tokenizer=tokenizer, sentence=sentence)
|
``` |