Files changed (4)
  1. README.md +0 -57
  2. config.json +0 -119
  3. pytorch_model.bin +0 -3
  4. vocab.txt +0 -0
README.md DELETED
@@ -1,57 +0,0 @@
- ---
- language: id
- tags:
- - indobert
- - indobenchmark
- ---
-
- ## How to use
-
- ### Load model and tokenizer
- ```python
- from transformers import AutoTokenizer, AutoModelForTokenClassification
-
- tokenizer = AutoTokenizer.from_pretrained("ageng-anugrah/indobert-large-p2-finetuned-pos")
- model = AutoModelForTokenClassification.from_pretrained("ageng-anugrah/indobert-large-p2-finetuned-pos")
- ```
-
- ### Extract POS tags
- ```python
- import torch
-
- def predict(model, tokenizer, sentence):
-     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-     inputs = tokenizer(sentence.split(),
-                        is_split_into_words=True,
-                        return_offsets_mapping=True,
-                        return_tensors="pt")
-
-     # move model and inputs to the GPU if one is available
-     model.to(device)
-     ids = inputs["input_ids"].to(device)
-     mask = inputs["attention_mask"].to(device)
-
-     # forward pass
-     outputs = model(ids, attention_mask=mask)
-     logits = outputs[0]
-
-     active_logits = logits.view(-1, model.num_labels)  # shape (batch_size * seq_len, num_labels)
-     flattened_predictions = torch.argmax(active_logits, dim=1)  # shape (batch_size * seq_len,), token-level predictions
-
-     tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
-     token_predictions = [model.config.id2label[i] for i in flattened_predictions.cpu().numpy()]
-     wp_preds = list(zip(tokens, token_predictions))  # list of (wordpiece, prediction) tuples
-
-     # only the prediction on the first word piece of each word is kept
-     prediction = []
-     for token_pred, mapping in zip(wp_preds, inputs["offset_mapping"].squeeze().tolist()):
-         if mapping[0] == 0 and mapping[1] != 0:
-             prediction.append(token_pred[1])
-
-     return sentence.split(), prediction
-
- sentence = "BJ Habibie adalah Presiden Indonesia ke-3"
- words, labels = predict(model, tokenizer, sentence)
- ```
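For context on what the deleted snippet returns, here is a minimal usage sketch, assuming `model`, `tokenizer`, and the `predict` function defined in the README above: it pairs each input word with its predicted tag.

```python
# Minimal usage sketch (assumes `model`, `tokenizer`, and `predict` from the deleted README above).
sentence = "BJ Habibie adalah Presiden Indonesia ke-3"
words, labels = predict(model, tokenizer, sentence)

# Print each word next to its predicted POS tag.
for word, label in zip(words, labels):
    print(f"{word}\t{label}")
```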
config.json DELETED
@@ -1,119 +0,0 @@
- {
-   "_name_or_path": "indobenchmark/indobert-large-p2",
-   "_num_labels": 5,
-   "architectures": [
-     "BertForTokenClassification"
-   ],
-   "attention_probs_dropout_prob": 0.1,
-   "classifier_dropout": null,
-   "directionality": "bidi",
-   "hidden_act": "gelu",
-   "hidden_dropout_prob": 0.1,
-   "hidden_size": 1024,
-   "id2label": {
-     "0": "B-CC",
-     "1": "B-CD",
-     "2": "B-DT",
-     "3": "B-FW",
-     "4": "B-IN",
-     "5": "B-JJ",
-     "6": "B-MD",
-     "7": "B-NEG",
-     "8": "B-NN",
-     "9": "B-NND",
-     "10": "B-NNP",
-     "11": "B-OD",
-     "12": "B-PR",
-     "13": "B-PRP",
-     "14": "B-RB",
-     "15": "B-RP",
-     "16": "B-SC",
-     "17": "B-SYM",
-     "18": "B-UH",
-     "19": "B-VB",
-     "20": "B-WH",
-     "21": "B-X",
-     "22": "B-Z",
-     "23": "I-CC",
-     "24": "I-CD",
-     "25": "I-IN",
-     "26": "I-JJ",
-     "27": "I-NN",
-     "28": "I-NND",
-     "29": "I-NNP",
-     "30": "I-OD",
-     "31": "I-PR",
-     "32": "I-PRP",
-     "33": "I-RB",
-     "34": "I-SC",
-     "35": "I-SYM",
-     "36": "I-UH",
-     "37": "I-VB",
-     "38": "I-WH",
-     "39": "I-X",
-     "40": "I-Z"
-   },
-   "initializer_range": 0.02,
-   "intermediate_size": 4096,
-   "label2id": {
-     "B-CC": 0,
-     "B-CD": 1,
-     "B-DT": 2,
-     "B-FW": 3,
-     "B-IN": 4,
-     "B-JJ": 5,
-     "B-MD": 6,
-     "B-NEG": 7,
-     "B-NN": 8,
-     "B-NND": 9,
-     "B-NNP": 10,
-     "B-OD": 11,
-     "B-PR": 12,
-     "B-PRP": 13,
-     "B-RB": 14,
-     "B-RP": 15,
-     "B-SC": 16,
-     "B-SYM": 17,
-     "B-UH": 18,
-     "B-VB": 19,
-     "B-WH": 20,
-     "B-X": 21,
-     "B-Z": 22,
-     "I-CC": 23,
-     "I-CD": 24,
-     "I-IN": 25,
-     "I-JJ": 26,
-     "I-NN": 27,
-     "I-NND": 28,
-     "I-NNP": 29,
-     "I-OD": 30,
-     "I-PR": 31,
-     "I-PRP": 32,
-     "I-RB": 33,
-     "I-SC": 34,
-     "I-SYM": 35,
-     "I-UH": 36,
-     "I-VB": 37,
-     "I-WH": 38,
-     "I-X": 39,
-     "I-Z": 40
-   },
-   "layer_norm_eps": 1e-12,
-   "max_position_embeddings": 512,
-   "model_type": "bert",
-   "num_attention_heads": 16,
-   "num_hidden_layers": 24,
-   "output_past": true,
-   "pad_token_id": 0,
-   "pooler_fc_size": 768,
-   "pooler_num_attention_heads": 12,
-   "pooler_num_fc_layers": 3,
-   "pooler_size_per_head": 128,
-   "pooler_type": "first_token_transform",
-   "position_embedding_type": "absolute",
-   "torch_dtype": "float32",
-   "transformers_version": "4.27.4",
-   "type_vocab_size": 2,
-   "use_cache": true,
-   "vocab_size": 30522
- }
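The deleted config.json carries the full POS tag inventory in its `id2label` / `label2id` maps. As a minimal sketch, assuming the `ageng-anugrah/indobert-large-p2-finetuned-pos` repository remains accessible on the Hub, that mapping can be inspected through `transformers.AutoConfig` without downloading the weights:

```python
from transformers import AutoConfig

# Load only the configuration (no weights); assumes the repo is still available on the Hub.
config = AutoConfig.from_pretrained("ageng-anugrah/indobert-large-p2-finetuned-pos")

print(config.num_labels)         # 41, derived from the id2label map above
print(config.id2label[8])        # "B-NN"
print(config.label2id["B-NNP"])  # 10
```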
pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:e3e8dbdd5d8a3d6374caea594e779bd3b5c3b6e5e364396929aa510ca1abb63b
- size 1336675437
vocab.txt DELETED
The diff for this file is too large to render. See raw diff