File size: 4,045 Bytes
d200bec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b06c1b9
 
d200bec
 
 
 
 
b06c1b9
d200bec
 
 
 
 
 
 
f356772
d200bec
 
 
 
 
 
 
 
 
 
f356772
d200bec
 
 
 
 
b06c1b9
 
d200bec
 
 
b06c1b9
 
d200bec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b06c1b9
d200bec
 
 
b06c1b9
d200bec
 
 
b06c1b9
d200bec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import torch
from transformers import Pipeline
from transformers import AutoTokenizer
from transformers.pipelines import PIPELINE_REGISTRY
from transformers import pipeline
from transformers import AutoModelForTokenClassification
from huggingface_hub import Repository
import sys
import os


class TokenizeAndAlignLabelsStep():
    """Tokenize pre-split words and flag the first sub-token of each word."""

    # Adapted From : https://huggingface.co/docs/transformers/tasks/token_classification
    def tokenize_and_align_labels(self, examples, tokenizer):
        """Encode ``examples`` and attach a ``labels_mask`` entry to the result.

        Args:
            examples: Words to encode; passed to ``tokenizer`` with
                ``is_split_into_words=True``.
            tokenizer: Callable tokenizer whose result exposes ``word_ids()``
                and supports item assignment.

        Returns:
            The tokenizer result with an extra ``"labels_mask"`` key: a list
            of booleans, one per token, that is ``True`` only for the first
            token of each word. Tokens with no word id are ``False``.
        """
        encoding = tokenizer(
            examples,
            padding='max_length',
            truncation=True,
            max_length=128,
            is_split_into_words=True,
        )

        first_subtoken_flags = []
        last_seen = None

        # word_ids() maps every token to the index of the word it came from;
        # tokens that belong to no word map to None.
        for current in encoding.word_ids():
            # A token is flagged when it belongs to a word and that word
            # differs from the one the preceding token belonged to.
            starts_word = current is not None and current != last_seen
            first_subtoken_flags.append(starts_word)
            last_seen = current

        encoding["labels_mask"] = first_subtoken_flags
        return encoding



class BERT_CRF_Pipeline(Pipeline):
    """Custom pipeline that labels pre-split tokens with the wrapped model.

    The input is a mapping with a ``'tokens'`` entry. The tokens are encoded
    with the tokenizer of ``TOKENIZER_CHECKPOINT``, fed to ``self.model``
    together with a first-sub-token mask, and the first element of the model
    output is translated to label names through ``model.config.id2label``.
    """

    # Checkpoint whose tokenizer is used to encode the incoming tokens.
    TOKENIZER_CHECKPOINT = "neuralmind/bert-base-portuguese-cased"

    def _sanitize_parameters(self, **kwargs):
        """Return empty preprocess/forward/postprocess kwargs.

        No call-time options are supported; any ``kwargs`` are ignored.
        """
        return {}, {}, {}

    def _get_tokenizer(self):
        """Return the tokenizer, loading it once and caching it on the instance.

        Loading inside ``preprocess`` on every call would re-read the
        tokenizer files for each input, which is pure overhead.
        """
        tokenizer = getattr(self, "_crf_tokenizer", None)
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(
                self.TOKENIZER_CHECKPOINT, do_lower_case=False)
            self._crf_tokenizer = tokenizer
        return tokenizer

    def _as_batch(self, values, dtype):
        """Build a tensor of ``dtype`` on the pipeline device with a leading batch axis."""
        # Use the pipeline's own device so inputs always live where the model
        # does, instead of guessing from CUDA availability.
        return torch.tensor(values, dtype=dtype, device=self.device).unsqueeze(0)

    def preprocess(self, inputs):
        """Encode ``inputs['tokens']`` and attach the ``labels_mask`` entry.

        Args:
            inputs: Mapping with a ``'tokens'`` entry holding the words to label.

        Returns:
            The tokenizer result extended with ``"labels_mask"``.
        """
        tokens = inputs['tokens']

        return TokenizeAndAlignLabelsStep().tokenize_and_align_labels(
            examples=tokens, tokenizer=self._get_tokenizer())

    def _forward(self, tokenizer_results):
        """Run the model on a single encoded example.

        Args:
            tokenizer_results: Output of ``preprocess``; must provide
                ``input_ids``, ``token_type_ids``, ``attention_mask`` and
                ``labels_mask``.

        Returns:
            Whatever ``self.model`` returns for the batch of one example.
        """
        input_ids = self._as_batch(tokenizer_results['input_ids'], torch.long)
        token_type_ids = self._as_batch(tokenizer_results['token_type_ids'], torch.long)
        attention_mask = self._as_batch(tokenizer_results['attention_mask'], torch.bool)
        labels_mask = self._as_batch(tokenizer_results['labels_mask'], torch.bool)

        # No gold labels are passed at inference time.
        outputs = self.model(input_ids=input_ids, token_type_ids=token_type_ids,
                             attention_mask=attention_mask, labels=None, labels_mask=labels_mask)

        return outputs

    def postprocess(self, model_outputs):
        """Translate the first model output from label ids to label names.

        Args:
            model_outputs: Output of ``_forward``. Its first element is
                presumably the predicted tag-id sequence for the single
                input -- TODO confirm against the model's remote code.

        Returns:
            A new list with one label name per predicted id; the model output
            itself is left unmodified.
        """
        id2label = self.model.config.id2label

        # From Ner_tags to Ner_labels
        return [id2label[tag] for tag in model_outputs[0]]



def main():
    """Register the custom pipeline, instantiate it, and push it to the Hub.

    In order: registers ``BERT_CRF_Pipeline`` under its task name, builds the
    pipeline from the Hub checkpoint, clones the model repository into
    ``<sys.path[0]>/out/pipeline``, saves the pipeline into that clone, and
    pushes the clone back to the Hub.
    """
    hub_repo_id = "arubenruben/PT-BERT-Large-CRF-Conll2003"
    task_name = "PT-BERT-Large-CRF-Conll2003-pipeline"

    # Prefer the GPU when one is available.
    if torch.cuda.is_available():
        run_device = torch.device("cuda")
    else:
        run_device = torch.device("cpu")

    PIPELINE_REGISTRY.register_pipeline(
        task_name,
        pipeline_class=BERT_CRF_Pipeline,
        pt_model=AutoModelForTokenClassification,
    )

    classifier = pipeline(
        task_name,
        model=hub_repo_id,
        device=run_device,
        trust_remote_code=True,
    )

    # Local working copy of the Hub repository, next to the script.
    out_path = os.path.join(sys.path[0], 'out', 'pipeline')
    repo = Repository(out_path, clone_from=hub_repo_id, use_auth_token=True)

    # repo.git_pull()

    classifier.save_pretrained(out_path)
    repo.push_to_hub()