import os
import sys

import torch
from huggingface_hub import Repository
from transformers import AutoModelForTokenClassification
from transformers import AutoTokenizer
from transformers import Pipeline
from transformers import pipeline
from transformers.pipelines import PIPELINE_REGISTRY

# Checkpoint whose tokenizer is used to encode the input text.
TOKENIZER_CHECKPOINT = "neuralmind/bert-base-portuguese-cased"
# Hub repository that hosts the model and receives the saved pipeline.
MODEL_REPO_ID = "arubenruben/PT-BERT-Large-CRF-Conll2003"
# Task name under which the custom pipeline is registered.
PIPELINE_TASK = "PT-BERT-Large-CRF-Conll2003-pipeline"


class TokenizeAndAlignLabelsStep:
    """Tokenize an input and flag the first sub-token of every word."""

    # Adapted From : https://huggingface.co/docs/transformers/tasks/token_classification
    def tokenize_and_align_labels(self, examples, tokenizer):
        """Encode ``examples`` and attach a first-sub-token mask.

        Args:
            examples: Input handed directly to ``tokenizer``; it is also
                stored unchanged under the ``"tokens"`` key of the result.
            tokenizer: Callable tokenizer whose result exposes ``word_ids()``.

        Returns:
            The tokenizer output (padded/truncated to 512 positions) with two
            extra entries: ``"tokens"`` (the original ``examples``) and
            ``"labels_mask"`` (one bool per position, ``True`` only on the
            first sub-token of each word).
        """
        tokenized_inputs = tokenizer(
            examples, padding='max_length', truncation=True, max_length=512)

        # Map every token position to the index of the word it came from.
        word_ids = tokenized_inputs.word_ids()

        labels_mask = []
        previous_word_idx = None
        for word_idx in word_ids:
            # Positions without a word index (word_idx is None) are masked
            # out; within a word only its first sub-token is kept.
            labels_mask.append(
                word_idx is not None and word_idx != previous_word_idx)
            previous_word_idx = word_idx

        tokenized_inputs["tokens"] = examples
        tokenized_inputs["labels_mask"] = labels_mask

        return tokenized_inputs


class BERT_CRF_Pipeline(Pipeline):
    """Custom pipeline that feeds tokenized text and a label mask to the model."""

    def _sanitize_parameters(self, **kwargs):
        """Return empty kwargs for preprocess, forward and postprocess."""
        return {}, {}, {}

    def _get_tokenizer(self):
        """Return the tokenizer, loading it on first use only.

        Loading via ``from_pretrained`` on every ``preprocess`` call is
        needlessly expensive, so the instance keeps a single copy.
        """
        tokenizer = getattr(self, "_crf_tokenizer", None)
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(
                TOKENIZER_CHECKPOINT, do_lower_case=False)
            self._crf_tokenizer = tokenizer
        return tokenizer

    def preprocess(self, text):
        """Tokenize ``text`` and compute its first-sub-token mask.

        Args:
            text: Input forwarded to the tokenizer.

        Returns:
            The output of
            ``TokenizeAndAlignLabelsStep.tokenize_and_align_labels``.
        """
        return TokenizeAndAlignLabelsStep().tokenize_and_align_labels(
            examples=text, tokenizer=self._get_tokenizer())

    def _forward(self, tokenizer_results):
        """Run the model on a single encoded example.

        Args:
            tokenizer_results: Mapping with ``input_ids``, ``token_type_ids``,
                ``attention_mask`` and ``labels_mask`` entries.

        Returns:
            Whatever ``self.model`` returns for the batched inputs.
        """
        # Build tensors on the device the pipeline itself was created with,
        # so they always sit next to the model weights.
        device = self.device

        def as_batch(key, dtype):
            # unsqueeze(0) adds the leading batch dimension.
            return torch.tensor(
                tokenizer_results[key], dtype=dtype, device=device
            ).unsqueeze(0)

        # input_ids, token_type_ids, attention_mask, labels, labels_mask
        outputs = self.model(
            input_ids=as_batch('input_ids', torch.long),
            token_type_ids=as_batch('token_type_ids', torch.long),
            attention_mask=as_batch('attention_mask', torch.bool),
            labels=None,
            labels_mask=as_batch('labels_mask', torch.bool))

        return outputs

    def postprocess(self, model_outputs):
        """Translate predicted tag ids into label names.

        Args:
            model_outputs: Model result; its first element is iterated and
                each item is looked up in ``self.model.config.id2label``
                (presumably a sequence of integer tag ids -- verify against
                the model implementation).

        Returns:
            A list with the label for every item of ``model_outputs[0]``.
        """
        # From Ner_tags to Ner_labels
        id2label = self.model.config.id2label
        return [id2label[tag] for tag in model_outputs[0]]


def main():
    """Register the pipeline, instantiate it, save it and push it to the Hub."""
    PIPELINE_REGISTRY.register_pipeline(
        PIPELINE_TASK,
        pipeline_class=BERT_CRF_Pipeline,
        pt_model=AutoModelForTokenClassification,
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    classifier = pipeline(
        PIPELINE_TASK,
        model=MODEL_REPO_ID,
        device=device,
        trust_remote_code=True)

    # The local clone lives in out/pipeline under sys.path[0].
    out_path = os.path.join(sys.path[0], 'out', 'pipeline')

    repo = Repository(
        out_path, clone_from=MODEL_REPO_ID, use_auth_token=True)

    # repo.git_pull()

    classifier.save_pretrained(out_path)
    repo.push_to_hub()