File size: 4,045 Bytes
d200bec b06c1b9 d200bec b06c1b9 d200bec f356772 d200bec f356772 d200bec b06c1b9 d200bec b06c1b9 d200bec b06c1b9 d200bec b06c1b9 d200bec b06c1b9 d200bec |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
import torch
from transformers import Pipeline
from transformers import AutoTokenizer
from transformers.pipelines import PIPELINE_REGISTRY
from transformers import pipeline
from transformers import AutoModelForTokenClassification
from huggingface_hub import Repository
import sys
import os
class TokenizeAndAlignLabelsStep():
    """Tokenize pre-split words and mark the first sub-token of every word."""

    # Adapted From : https://huggingface.co/docs/transformers/tasks/token_classification
    def tokenize_and_align_labels(self, examples, tokenizer):
        """Tokenize ``examples`` and attach a ``"labels_mask"`` entry.

        Args:
            examples: Words to tokenize; forwarded to ``tokenizer`` with
                ``is_split_into_words=True``.
            tokenizer: Callable invoked with fixed-length settings
                (``padding='max_length'``, ``truncation=True``,
                ``max_length=128``). Its result must expose ``word_ids()``
                and support item assignment.

        Returns:
            The tokenizer result, extended with ``"labels_mask"``: one boolean
            per token, ``True`` only where the token's word id is not ``None``
            and differs from the word id of the preceding token.
        """
        encoding = tokenizer(
            examples,
            padding='max_length',
            truncation=True,
            max_length=128,
            is_split_into_words=True,
        )

        # Word id of every token; ``None`` entries never get a label.
        word_ids = encoding.word_ids()

        # Pair each token's word id with its predecessor's (``None`` before the
        # first token). A token is labelled only when it opens a new word.
        encoding["labels_mask"] = [
            current is not None and current != previous
            for previous, current in zip([None, *word_ids], word_ids)
        ]
        return encoding
class BERT_CRF_Pipeline(Pipeline):
    """Custom pipeline that runs a BERT+CRF token classifier on pre-split words.

    ``preprocess`` tokenizes ``inputs['tokens']``, ``_forward`` calls the model
    with a batch of size one, and ``postprocess`` maps the predicted tag ids of
    the first returned sequence to label names via ``model.config.id2label``.
    """

    # Checkpoint whose tokenizer is used by ``preprocess``.
    _TOKENIZER_NAME = "neuralmind/bert-base-portuguese-cased"

    def _sanitize_parameters(self, **kwargs):
        """Return empty preprocess/forward/postprocess kwargs; none are accepted."""
        return {}, {}, {}

    def _get_tokenizer(self):
        """Load the tokenizer on first use and reuse it for later calls."""
        tokenizer = getattr(self, "_cached_tokenizer", None)
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(
                self._TOKENIZER_NAME, do_lower_case=False)
            # Cached so that each input does not reload the tokenizer.
            self._cached_tokenizer = tokenizer
        return tokenizer

    def preprocess(self, inputs):
        """Tokenize ``inputs['tokens']`` and compute the first-sub-token mask.

        Args:
            inputs: Mapping with a ``'tokens'`` entry; the value is tokenized
                as a sequence of already-split words.

        Returns:
            The tokenizer output including the extra ``"labels_mask"`` entry.
        """
        tokens = inputs['tokens']
        return TokenizeAndAlignLabelsStep().tokenize_and_align_labels(
            examples=tokens, tokenizer=self._get_tokenizer())

    def _forward(self, tokenizer_results):
        """Build single-example batch tensors and run the model.

        Args:
            tokenizer_results: Mapping with ``input_ids``, ``token_type_ids``,
                ``attention_mask`` and ``labels_mask`` entries.

        Returns:
            Whatever ``self.model`` returns for the batch.
        """
        # Use the device the pipeline was configured with, so the inputs live
        # where the model does even if CUDA is available but was not selected.
        device = self.device

        def as_batch(key, dtype):
            # unsqueeze(0) adds the leading batch dimension of size one.
            return torch.tensor(
                tokenizer_results[key], dtype=dtype, device=device).unsqueeze(0)

        input_ids = as_batch('input_ids', torch.long)
        token_type_ids = as_batch('token_type_ids', torch.long)
        attention_mask = as_batch('attention_mask', torch.bool)
        labels_mask = as_batch('labels_mask', torch.bool)

        # input_ids, token_type_ids, attention_mask, labels, labels_mask
        outputs = self.model(input_ids=input_ids, token_type_ids=token_type_ids,
                             attention_mask=attention_mask, labels=None, labels_mask=labels_mask)
        return outputs

    def postprocess(self, model_outputs):
        """Convert the tag ids in ``model_outputs[0]`` into label names.

        Args:
            model_outputs: Model result whose first element is iterated as a
                sequence of tag ids (keys of ``model.config.id2label``).

        Returns:
            A new list of label names; ``model_outputs`` is left unmodified.
        """
        # From Ner_tags to Ner_labels
        id2label = self.model.config.id2label
        return [id2label[tag] for tag in model_outputs[0]]
def main():
    """Register the custom pipeline, instantiate it, and publish it to the Hub.

    The pipeline is saved under ``<sys.path[0]>/out/pipeline``, a directory
    cloned from the model repository, and then pushed.
    """
    task_name = "PT-BERT-Large-CRF-Conll2003-pipeline"
    repo_id = "arubenruben/PT-BERT-Large-CRF-Conll2003"

    PIPELINE_REGISTRY.register_pipeline(
        task_name,
        pipeline_class=BERT_CRF_Pipeline,
        pt_model=AutoModelForTokenClassification,
    )

    # Prefer the GPU when one is available.
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    classifier = pipeline(
        task_name,
        model=repo_id,
        device=device,
        trust_remote_code=True,
    )

    out_path = os.path.join(sys.path[0], 'out', 'pipeline')
    repo = Repository(out_path, clone_from=repo_id, use_auth_token=True)

    # NOTE: no explicit git pull is done before saving.
    classifier.save_pretrained(out_path)
    repo.push_to_hub()