arubenruben committed on
Commit
d200bec
·
1 Parent(s): eb2d250

commit files to HF hub

Browse files
Files changed (3) hide show
  1. config.json +12 -3
  2. deploy_pipeline.py +101 -0
  3. pytorch_model.bin +2 -2
config.json CHANGED
@@ -1,13 +1,22 @@
1
  {
2
- "_name_or_path": "/notebooks/src/hugging_face_pipeline/BERT-CRF/out/model",
3
  "architectures": [
4
  "BERT_CRF"
5
  ],
6
  "auto_map": {
7
- "AutoConfig": "model.BERT_CRF_Config",
8
- "AutoModelForTokenClassification": "model.BERT_CRF"
9
  },
10
  "bert_name": "neuralmind/bert-large-portuguese-cased",
 
 
 
 
 
 
 
 
 
11
  "id2label": {
12
  "0": "O",
13
  "1": "B-PER",
 
1
  {
2
+ "_name_or_path": "arubenruben/PT-BERT-Large-CRF-Conll2003",
3
  "architectures": [
4
  "BERT_CRF"
5
  ],
6
  "auto_map": {
7
+ "AutoConfig": "arubenruben/PT-BERT-Large-CRF-Conll2003--model.BERT_CRF_Config",
8
+ "AutoModelForTokenClassification": "arubenruben/PT-BERT-Large-CRF-Conll2003--model.BERT_CRF"
9
  },
10
  "bert_name": "neuralmind/bert-large-portuguese-cased",
11
+ "custom_pipelines": {
12
+ "PT-BERT-Large-CRF-Conll2003-pipeline": {
13
+ "impl": "deploy_pipeline.BERT_CRF_Pipeline",
14
+ "pt": [
15
+ "AutoModelForTokenClassification"
16
+ ],
17
+ "tf": []
18
+ }
19
+ },
20
  "id2label": {
21
  "0": "O",
22
  "1": "B-PER",
deploy_pipeline.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import Pipeline
3
+ from transformers import AutoTokenizer
4
+ from transformers.pipelines import PIPELINE_REGISTRY
5
+ from transformers import pipeline
6
+ from transformers import AutoModelForTokenClassification
7
+ from huggingface_hub import Repository
8
+ import sys
9
+ import os
10
+
11
+
12
class TokenizeAndAlignLabelsStep():
    """Tokenize an input and flag the first sub-token of every word."""

    # Adapted From : https://huggingface.co/docs/transformers/tasks/token_classification
    def tokenize_and_align_labels(self, examples, tokenizer, max_length=512):
        """Tokenize ``examples`` and attach a first-sub-token mask.

        Args:
            examples: Input passed unchanged to ``tokenizer``; it is also
                stored back on the result under the ``"tokens"`` key.
            tokenizer: Callable invoked as
                ``tokenizer(examples, padding=..., truncation=..., max_length=...)``.
                Its return value must support item assignment and expose a
                ``word_ids()`` method.
            max_length: Length the encoding is padded/truncated to.
                Defaults to 512, the value that used to be hard-coded.

        Returns:
            The object returned by ``tokenizer``, extended with two keys:
            ``"tokens"`` (the original ``examples``) and ``"labels_mask"``
            (one boolean per entry of ``word_ids()``).
        """
        tokenized_inputs = tokenizer(
            examples, padding='max_length', truncation=True, max_length=max_length)

        # Map tokens to their respective word.
        word_ids = tokenized_inputs.word_ids()

        labels_mask = []
        previous_word_idx = None
        for word_idx in word_ids:
            # True only for the first sub-token of a word. Positions without
            # a word index (None) and continuation sub-tokens of the same
            # word are masked out with False.
            labels_mask.append(
                word_idx is not None and word_idx != previous_word_idx)
            previous_word_idx = word_idx

        tokenized_inputs["tokens"] = examples
        tokenized_inputs["labels_mask"] = labels_mask

        return tokenized_inputs
41
+
42
+
43
class BERT_CRF_Pipeline(Pipeline):
    """Custom pipeline: tokenize text, run the model, map tag ids to labels."""

    # Checkpoint the tokenizer is loaded from.
    # NOTE(review): the accompanying config.json sets "bert_name" to the
    # *large* Portuguese checkpoint while this is the *base* one. Confirm the
    # two share a vocabulary before relying on this.
    TOKENIZER_NAME = "neuralmind/bert-base-portuguese-cased"

    def _sanitize_parameters(self, **kwargs):
        """Accept no extra parameters for preprocess/forward/postprocess."""
        return {}, {}, {}

    def _get_tokenizer(self):
        """Return the tokenizer, loading it once and caching it on the instance."""
        tokenizer = getattr(self, "_crf_tokenizer", None)
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(
                self.TOKENIZER_NAME, do_lower_case=False)
            # Cached so repeated calls do not re-instantiate the tokenizer.
            self._crf_tokenizer = tokenizer
        return tokenizer

    def preprocess(self, text):
        """Tokenize ``text`` and attach the ``labels_mask`` used by the model.

        Returns:
            The result of
            ``TokenizeAndAlignLabelsStep.tokenize_and_align_labels`` for
            ``text``.
        """
        return TokenizeAndAlignLabelsStep().tokenize_and_align_labels(
            examples=text, tokenizer=self._get_tokenizer())

    def _as_batch(self, values, dtype):
        """Build a tensor of ``dtype`` on the pipeline device with a leading batch axis."""
        # self.device is the device the pipeline was created with. Using it
        # (rather than probing CUDA availability) keeps the inputs on the
        # same device as the pipeline.
        return torch.tensor(values, dtype=dtype, device=self.device).unsqueeze(0)

    def _forward(self, tokenizer_results):
        """Run the model on one tokenized example.

        Args:
            tokenizer_results: Mapping with ``input_ids``, ``token_type_ids``,
                ``attention_mask`` and ``labels_mask`` entries, as produced
                by ``preprocess``.

        Returns:
            Whatever ``self.model`` returns when called with ``labels=None``.
        """
        input_ids = self._as_batch(tokenizer_results['input_ids'], torch.long)
        token_type_ids = self._as_batch(
            tokenizer_results['token_type_ids'], torch.long)
        attention_mask = self._as_batch(
            tokenizer_results['attention_mask'], torch.bool)
        labels_mask = self._as_batch(
            tokenizer_results['labels_mask'], torch.bool)

        # input_ids, token_type_ids, attention_mask, labels, labels_mask
        outputs = self.model(input_ids=input_ids, token_type_ids=token_type_ids,
                             attention_mask=attention_mask, labels=None, labels_mask=labels_mask)

        return outputs

    def postprocess(self, model_outputs):
        """Translate the first output sequence from tag ids to label names.

        Args:
            model_outputs: Indexable model output; only ``model_outputs[0]``
                is read, and each of its items is looked up in
                ``self.model.config.id2label``.

        Returns:
            A new list with one label per item of ``model_outputs[0]``; the
            model output itself is left unmodified.
        """
        # From Ner_tags to Ner_labels
        id2label = self.model.config.id2label
        return [id2label[label] for label in model_outputs[0]]
84
+
85
+
86
def main():
    """Register the custom pipeline, load it, save it locally and push it to the Hub.

    Steps, in order:
      1. Register ``BERT_CRF_Pipeline`` under its task name.
      2. Build the pipeline from the Hub model, on CUDA when available,
         otherwise on CPU.
      3. Clone the model repository into ``<sys.path[0]>/out/pipeline``.
      4. Save the pipeline into that clone and push it.

    Needs network access and Hub credentials (``use_auth_token=True``).
    """
    repo_id = "arubenruben/PT-BERT-Large-CRF-Conll2003"
    task_name = "PT-BERT-Large-CRF-Conll2003-pipeline"
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    PIPELINE_REGISTRY.register_pipeline(
        task_name,
        pipeline_class=BERT_CRF_Pipeline,
        pt_model=AutoModelForTokenClassification,
    )
    classifier = pipeline(
        task_name, model=repo_id, device=device, trust_remote_code=True)

    out_path = os.path.join(sys.path[0], 'out', 'pipeline')
    repo = Repository(out_path, clone_from=repo_id, use_auth_token=True)

    # repo.git_pull()

    classifier.save_pretrained(out_path)
    repo.push_to_hub()
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5996a9b88e08415a34eef073d5708a7b916cebc099aa379d3a4f1051cdcafb0e
3
- size 1337754151
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0fd911f1cc52f0e99fbab1fb1713b59794f717633a39e88b1f12ccf73b908db0
3
+ size 1337757121