|
import torch |
|
from transformers import Pipeline |
|
from transformers import AutoTokenizer |
|
from transformers.pipelines import PIPELINE_REGISTRY |
|
from transformers import pipeline |
|
from transformers import AutoModelForTokenClassification |
|
from huggingface_hub import Repository |
|
import sys |
|
import os |
|
|
|
|
|
class TokenizeAndAlignLabelsStep():
    """Tokenize text and mark which tokens start a new word."""

    def tokenize_and_align_labels(self, examples, tokenizer):
        """Tokenize ``examples`` and attach a first-sub-token mask.

        Args:
            examples: Input passed straight to ``tokenizer``; it is also
                stored unchanged under the ``"tokens"`` key of the result.
            tokenizer: Callable invoked with ``padding='max_length'``,
                ``truncation=True`` and ``max_length=128``. Its return value
                must expose ``word_ids()`` and support item assignment.

        Returns:
            The tokenizer output, extended with ``"tokens"`` and
            ``"labels_mask"``. The mask holds one boolean per entry of
            ``word_ids()``: ``True`` where the word id is not ``None`` and
            differs from the entry right before it, ``False`` otherwise.
        """
        encoding = tokenizer(
            examples, padding='max_length', truncation=True, max_length=128)

        word_ids = encoding.word_ids()

        # Pair every word id with its predecessor; the first entry has none.
        preceding = [None, *word_ids[:-1]]
        first_of_word = [
            current is not None and current != before
            for before, current in zip(preceding, word_ids)
        ]

        encoding["tokens"] = examples
        encoding["labels_mask"] = first_of_word

        return encoding
|
|
|
|
|
class BERT_CRF_Pipeline(Pipeline):
    """Pipeline that tokenizes text, runs the model and names the labels.

    The three stages are:

    * ``preprocess``: tokenize the text and build the first-sub-token mask
      via ``TokenizeAndAlignLabelsStep``.
    * ``_forward``: turn the tokenizer output into single-example batches
      and call ``self.model``.
    * ``postprocess``: map the first model output through
      ``self.model.config.id2label``.
    """

    def _sanitize_parameters(self, **kwargs):
        """Return empty kwargs for preprocess, forward and postprocess.

        No call-time options are supported, so ``kwargs`` is ignored.
        """
        return {}, {}, {}

    def _crf_tokenizer(self):
        """Return the preprocessing tokenizer, loading it on first use.

        Loading from a checkpoint is expensive, so the tokenizer is kept on
        the instance instead of being re-created for every input.
        """
        tokenizer = getattr(self, "_cached_crf_tokenizer", None)
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(
                "neuralmind/bert-base-portuguese-cased", do_lower_case=False)
            self._cached_crf_tokenizer = tokenizer
        return tokenizer

    def preprocess(self, text):
        """Tokenize ``text`` and attach the ``labels_mask``.

        Args:
            text: Raw input forwarded to the tokenizer.

        Returns:
            The tokenizer output extended with ``"tokens"`` and
            ``"labels_mask"`` (see ``TokenizeAndAlignLabelsStep``).
        """
        # Tokenize exactly once; the result is returned directly.
        return TokenizeAndAlignLabelsStep().tokenize_and_align_labels(
            examples=text, tokenizer=self._crf_tokenizer())

    def _forward(self, tokenizer_results):
        """Run the model on one preprocessed example.

        Args:
            tokenizer_results: Mapping with ``input_ids``,
                ``token_type_ids``, ``attention_mask`` and ``labels_mask``
                entries, as produced by ``preprocess``.

        Returns:
            Whatever ``self.model`` returns for the batch of size one.
        """
        # Build the inputs on the pipeline's own device so they always match
        # the device the model was placed on.
        device = self.device

        def as_batch(key, dtype):
            # unsqueeze(0) adds the leading batch dimension.
            return torch.tensor(
                tokenizer_results[key], dtype=dtype, device=device
            ).unsqueeze(0)

        return self.model(
            input_ids=as_batch('input_ids', torch.long),
            token_type_ids=as_batch('token_type_ids', torch.long),
            attention_mask=as_batch('attention_mask', torch.bool),
            labels=None,
            labels_mask=as_batch('labels_mask', torch.bool),
        )

    def postprocess(self, model_outputs):
        """Translate predicted label ids into label names.

        Args:
            model_outputs: Model output whose first element is an iterable
                of label ids (presumably the predictions for the single
                example in the batch; confirm against the model).

        Returns:
            A new list with ``self.model.config.id2label[label]`` for each
            id; ``model_outputs`` itself is left unmodified.
        """
        id2label = self.model.config.id2label
        return [id2label[label] for label in model_outputs[0]]
|
|
|
|
|
def main():
    """Register the custom pipeline, instantiate it and push it to the Hub.

    Steps, in order: register ``BERT_CRF_Pipeline`` under its task name,
    build the pipeline for the Hub model (on CUDA when available, otherwise
    CPU), create a ``Repository`` at ``<sys.path[0]>/out/pipeline`` with
    ``clone_from`` set to the model repo, save the pipeline into that
    directory and push it.
    """
    task_name = "PT-BERT-Large-CRF-Conll2003-pipeline"
    hub_repo_id = "arubenruben/PT-BERT-Large-CRF-Conll2003"

    PIPELINE_REGISTRY.register_pipeline(
        task_name,
        pipeline_class=BERT_CRF_Pipeline,
        pt_model=AutoModelForTokenClassification,
    )

    if torch.cuda.is_available():
        target_device = torch.device("cuda")
    else:
        target_device = torch.device("cpu")

    classifier = pipeline(
        task_name,
        model=hub_repo_id,
        device=target_device,
        trust_remote_code=True,
    )

    # sys.path[0] is used as the base directory for the local working copy.
    out_path = os.path.join(sys.path[0], 'out', 'pipeline')
    repo = Repository(out_path, clone_from=hub_repo_id, use_auth_token=True)

    classifier.save_pretrained(out_path)
    repo.push_to_hub()