Linhz committed
Commit c6538e0 · verified · 1 Parent(s): 548e229

Upload 4 files

Model/NER/VLSP2021/Load_model.py ADDED
@@ -0,0 +1,34 @@
+ from transformers import RobertaConfig, AutoConfig
+ from transformers import AutoTokenizer, AutoModelForTokenClassification
+ from Model.NER.VLSP2021.Ner_CRF import PhoBertCrf, PhoBertSoftmax, PhoBertLstmCrf
+ from Model.NER.VLSP2021.Predict_Ner import ViTagger
+ import torch
+ from spacy import displacy
+ import re
+ 
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+ 
+ MODEL_MAPPING = {
+     'vinai/phobert-base': {
+         'softmax': PhoBertSoftmax,
+         'crf': PhoBertCrf,
+         'lstm_crf': PhoBertLstmCrf
+     },
+ }
+ 
+ if device == 'cpu':
+     checkpoint_data = torch.load('E:/demo_datn/pythonProject1/Model/NER/VLSP2021/best_model.pt', map_location='cpu')
+ else:
+     checkpoint_data = torch.load('E:/demo_datn/pythonProject1/Model/NER/VLSP2021/best_model.pt')
+ 
+ configs = checkpoint_data['args']
+ print(configs.model_name_or_path)
+ tokenizer = AutoTokenizer.from_pretrained(configs.model_name_or_path)
+ model_clss = MODEL_MAPPING[configs.model_name_or_path][configs.model_arch]
+ config = AutoConfig.from_pretrained(configs.model_name_or_path,
+                                     num_labels=len(checkpoint_data['classes']),
+                                     finetuning_task=configs.task)
+ model = model_clss(config=config)
+ model.resize_token_embeddings(len(tokenizer))
+ model.to(device)
+ model.load_state_dict(checkpoint_data['model'], strict=False)
+ print(model)
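Note (not part of the committed files): Load_model.py loads the checkpoint eagerly at import time. A minimal usage sketch, assuming the hard-coded checkpoint and VnCoreNLP paths from this commit are valid on the target machine, would instead go through the ViTagger class added in Predict_Ner.py:

# Illustrative only; paths follow the hard-coded ones in this commit.
from Model.NER.VLSP2021.Predict_Ner import ViTagger

tagger = ViTagger('E:/demo_datn/pythonProject1/Model/NER/VLSP2021/best_model.pt', no_cuda=True)
print(tagger("Hà Nội là thủ đô của Việt Nam."))  # -> list of (text, tag) pairs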
Model/NER/VLSP2021/Ner_CRF.py ADDED
@@ -0,0 +1,144 @@
+ from typing import Optional, List, Tuple, Any
+ from collections import OrderedDict
+ from transformers import logging, RobertaForTokenClassification
+ from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
+ from torchcrf import CRF
+ 
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ 
+ logging.set_verbosity_error()
+ 
+ 
+ class NerOutput(OrderedDict):
+     loss: Optional[torch.FloatTensor] = torch.FloatTensor([0.0])
+     tags: Optional[List[int]] = []
+ 
+     def __getitem__(self, k):
+         if isinstance(k, str):
+             inner_dict = {k: v for (k, v) in self.items()}
+             return inner_dict[k]
+         else:
+             return self.to_tuple()[k]
+ 
+     def __setattr__(self, name, value):
+         if name in self.keys() and value is not None:
+             super().__setitem__(name, value)
+         super().__setattr__(name, value)
+ 
+     def __setitem__(self, key, value):
+         super().__setitem__(key, value)
+         super().__setattr__(key, value)
+ 
+     def to_tuple(self) -> Tuple[Any]:
+         return tuple(self[k] for k in self.keys())
+ 
+ 
+ class PhoBertSoftmax(RobertaForTokenClassification):
+     def __init__(self, config, **kwargs):
+         super(PhoBertSoftmax, self).__init__(config=config, **kwargs)
+         self.num_labels = config.num_labels
+ 
+     def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, valid_ids=None,
+                 label_masks=None):
+         seq_output = self.roberta(input_ids=input_ids,
+                                   token_type_ids=token_type_ids,
+                                   attention_mask=attention_mask,
+                                   head_mask=None)[0]
+         seq_output = self.dropout(seq_output)
+         logits = self.classifier(seq_output)
+         probs = F.log_softmax(logits, dim=2)
+         label_masks = label_masks.view(-1) != 0
+         seq_tags = torch.masked_select(torch.argmax(probs, dim=2).view(-1), label_masks).tolist()
+         if labels is not None:
+             loss_func = nn.CrossEntropyLoss()
+             loss = loss_func(logits.view(-1, self.num_labels), labels.view(-1))
+             return NerOutput(loss=loss, tags=seq_tags)
+         else:
+             return NerOutput(tags=seq_tags)
+ 
+ 
+ class PhoBertCrf(RobertaForTokenClassification):
+     def __init__(self, config):
+         super(PhoBertCrf, self).__init__(config=config)
+         self.num_labels = config.num_labels
+         self.crf = CRF(config.num_labels, batch_first=True)
+         self.init_weights()
+ 
+     def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, valid_ids=None,
+                 label_masks=None):
+         seq_outputs = self.roberta(input_ids=input_ids,
+                                    token_type_ids=token_type_ids,
+                                    attention_mask=attention_mask,
+                                    head_mask=None)[0]
+ 
+         batch_size, max_len, feat_dim = seq_outputs.shape
+         range_vector = torch.arange(0, batch_size, dtype=torch.long, device=seq_outputs.device).unsqueeze(1)
+         # Keep only the hidden state of the first sub-token of each word.
+         seq_outputs = seq_outputs[range_vector, valid_ids]
+         seq_outputs = self.dropout(seq_outputs)
+         logits = self.classifier(seq_outputs)
+         seq_tags = self.crf.decode(logits, mask=label_masks != 0)
+ 
+         if labels is not None:
+             log_likelihood = self.crf(logits, labels, mask=label_masks.type(torch.uint8))
+             return NerOutput(loss=-1.0 * log_likelihood, tags=seq_tags)
+         else:
+             return NerOutput(tags=seq_tags)
+ 
+ 
+ class PhoBertLstmCrf(RobertaForTokenClassification):
+     def __init__(self, config):
+         super(PhoBertLstmCrf, self).__init__(config=config)
+         self.num_labels = config.num_labels
+         self.lstm = nn.LSTM(input_size=config.hidden_size,
+                             hidden_size=config.hidden_size // 2,
+                             num_layers=1,
+                             batch_first=True,
+                             bidirectional=True)
+         self.crf = CRF(config.num_labels, batch_first=True)
+ 
+     @staticmethod
+     def sort_batch(src_tensor, lengths):
+         """
+         Sort a minibatch by sequence length, longest sequences first, and
+         return the sorted batch, the sorted lengths, and the indices needed
+         to restore the original order, so the output can be used by
+         pack_padded_sequence(...).
+         """
+         seq_lengths, perm_idx = lengths.sort(0, descending=True)
+         seq_tensor = src_tensor[perm_idx]
+         _, reversed_idx = perm_idx.sort(0, descending=False)
+         return seq_tensor, seq_lengths, reversed_idx
+ 
+     def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, valid_ids=None,
+                 label_masks=None):
+         seq_outputs = self.roberta(input_ids=input_ids,
+                                    token_type_ids=token_type_ids,
+                                    attention_mask=attention_mask,
+                                    head_mask=None)[0]
+ 
+         batch_size, max_len, feat_dim = seq_outputs.shape
+         seq_lens = torch.sum(label_masks, dim=-1)
+         range_vector = torch.arange(0, batch_size, dtype=torch.long, device=seq_outputs.device).unsqueeze(1)
+         seq_outputs = seq_outputs[range_vector, valid_ids]
+ 
+         sorted_seq_outputs, sorted_seq_lens, reversed_idx = self.sort_batch(src_tensor=seq_outputs,
+                                                                             lengths=seq_lens)
+         packed_words = pack_padded_sequence(sorted_seq_outputs, sorted_seq_lens.cpu(), batch_first=True)
+         lstm_outs, _ = self.lstm(packed_words)
+         lstm_outs, _ = pad_packed_sequence(lstm_outs, batch_first=True, total_length=max_len)
+         seq_outputs = lstm_outs[reversed_idx]
+ 
+         seq_outputs = self.dropout(seq_outputs)
+         logits = self.classifier(seq_outputs)
+         seq_tags = self.crf.decode(logits, mask=label_masks != 0)
+ 
+         if labels is not None:
+             log_likelihood = self.crf(logits, labels, mask=label_masks.type(torch.uint8))
+             return NerOutput(loss=-1.0 * log_likelihood, tags=seq_tags)
+         else:
+             return NerOutput(tags=seq_tags)
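For context (not part of the commit): all three heads rely on the torchcrf package. A minimal, self-contained sketch of the CRF calls used above, with illustrative shapes, looks like this:

import torch
from torchcrf import CRF

num_labels, batch, seq_len = 5, 2, 4
crf = CRF(num_labels, batch_first=True)

emissions = torch.randn(batch, seq_len, num_labels)       # classifier logits
labels = torch.randint(0, num_labels, (batch, seq_len))   # gold tag ids
mask = torch.ones(batch, seq_len, dtype=torch.uint8)      # 1 = real token

log_likelihood = crf(emissions, labels, mask=mask)   # scalar log-likelihood
loss = -1.0 * log_likelihood                         # as in PhoBertCrf.forward
best_paths = crf.decode(emissions, mask=mask)        # list of tag-id sequences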
Model/NER/VLSP2021/Predict_Ner.py ADDED
@@ -0,0 +1,210 @@
+ from vncorenlp import VnCoreNLP
+ 
+ from typing import Union
+ from transformers import AutoConfig, AutoTokenizer
+ from Model.NER.VLSP2021.Ner_CRF import PhoBertCrf, PhoBertSoftmax, PhoBertLstmCrf
+ import re
+ import os
+ import torch
+ import itertools
+ import numpy as np
+ 
+ MODEL_MAPPING = {
+     'vinai/phobert-base': {
+         'softmax': PhoBertSoftmax,
+         'crf': PhoBertCrf,
+         'lstm_crf': PhoBertLstmCrf
+     },
+ }
+ 
+ 
+ def normalize_text(txt: str) -> str:
+     # Remove invisible/special characters
+     txt = re.sub("\xad|\u200b|\ufeff", "", txt)
+     # Normalize Vietnamese tone-mark placement
+     txt = re.sub(r"òa", "oà", txt)
+     txt = re.sub(r"óa", "oá", txt)
+     txt = re.sub(r"ỏa", "oả", txt)
+     txt = re.sub(r"õa", "oã", txt)
+     txt = re.sub(r"ọa", "oạ", txt)
+     txt = re.sub(r"òe", "oè", txt)
+     txt = re.sub(r"óe", "oé", txt)
+     txt = re.sub(r"ỏe", "oẻ", txt)
+     txt = re.sub(r"õe", "oẽ", txt)
+     txt = re.sub(r"ọe", "oẹ", txt)
+     txt = re.sub(r"ùy", "uỳ", txt)
+     txt = re.sub(r"úy", "uý", txt)
+     txt = re.sub(r"ủy", "uỷ", txt)
+     txt = re.sub(r"ũy", "uỹ", txt)
+     txt = re.sub(r"ụy", "uỵ", txt)
+     txt = re.sub(r"Ủy", "Uỷ", txt)
+ 
+     txt = re.sub(r'"', '”', txt)
+ 
+     # Collapse repeated spaces
+     txt = re.sub(" +", " ", txt)
+     return txt.strip()
+ 
+ 
+ class ViTagger(object):
+     def __init__(self, model_path: Union[str, os.PathLike], no_cuda=False):
+         self.device = 'cuda' if not no_cuda and torch.cuda.is_available() else 'cpu'
+         print("[ViTagger] VnCoreNLP loading ...")
+         self.rdrsegmenter = VnCoreNLP("E:/demo_datn/pythonProject1/VnCoreNLP/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m')
+         print("[ViTagger] Model loading ...")
+         self.model, self.tokenizer, self.max_seq_len, self.label2id, self.use_crf = self.load_model(model_path, device=self.device)
+         self.id2label = {idx: label for idx, label in enumerate(self.label2id)}
+         print("[ViTagger] All ready!")
+ 
+     @staticmethod
+     def load_model(model_path: Union[str, os.PathLike], device='cpu'):
+         if device == 'cpu':
+             checkpoint_data = torch.load(model_path, map_location='cpu')
+         else:
+             checkpoint_data = torch.load(model_path)
+         args = checkpoint_data["args"]
+         max_seq_len = args.max_seq_length
+         use_crf = 'crf' in args.model_arch
+         tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=False)
+         config = AutoConfig.from_pretrained(args.model_name_or_path, num_labels=len(args.label2id))
+         model_clss = MODEL_MAPPING[args.model_name_or_path][args.model_arch]
+         model = model_clss(config=config)
+         model.load_state_dict(checkpoint_data['model'], strict=False)
+         model.to(device)
+         model.eval()
+ 
+         return model, tokenizer, max_seq_len, args.label2id, use_crf
+ 
+     def preprocess(self, in_raw: str):
+         norm_text = normalize_text(in_raw)
+         sents = []
+         sentences = self.rdrsegmenter.tokenize(norm_text)
+         for sentence in sentences:
+             sents.append(sentence)
+         return sents
+ 
+     def convert_tensor(self, tokens):
+         seq_len = len(tokens)
+         encoding = self.tokenizer(tokens,
+                                   padding='max_length',
+                                   truncation=True,
+                                   is_split_into_words=True,
+                                   max_length=self.max_seq_len)
+         if 'vinai/phobert' in self.tokenizer.name_or_path:
+             print(' '.join(tokens))
+             subwords = self.tokenizer.tokenize(' '.join(tokens))
+             valid_ids = np.zeros(len(encoding.input_ids), dtype=int)
+             label_marks = np.zeros(len(encoding.input_ids), dtype=int)
+             i = 1
+             for idx, subword in enumerate(subwords[:self.max_seq_len - 2]):
+                 if idx != 0 and subwords[idx - 1].endswith("@@"):
+                     continue
+                 if self.use_crf:
+                     valid_ids[i - 1] = idx + 1
+                 else:
+                     valid_ids[idx + 1] = 1
+                 i += 1
+         else:
+             valid_ids = np.zeros(len(encoding.input_ids), dtype=int)
+             label_marks = np.zeros(len(encoding.input_ids), dtype=int)
+             i = 1
+             word_ids = encoding.word_ids()
+             for idx in range(1, len(word_ids)):
+                 if word_ids[idx] is not None and word_ids[idx] != word_ids[idx - 1]:
+                     if self.use_crf:
+                         valid_ids[i - 1] = idx
+                     else:
+                         valid_ids[idx] = 1
+                     i += 1
+         if self.max_seq_len >= seq_len + 2:
+             label_marks[:seq_len] = [1] * seq_len
+         else:
+             label_marks[:-2] = [1] * (self.max_seq_len - 2)
+         if self.use_crf and label_marks[0] == 0:
+             raise ValueError(f"{tokens} have mark == 0 at index 0!")
+         item = {key: torch.as_tensor([val]).to(self.device, dtype=torch.long) for key, val in encoding.items()}
+         item['valid_ids'] = torch.as_tensor([valid_ids]).to(self.device, dtype=torch.long)
+         item['label_masks'] = torch.as_tensor([valid_ids]).to(self.device, dtype=torch.long)
+         return item
+ 
+     def extract_entity_doc(self, in_raw: str):
+         sents = self.preprocess(in_raw)
+         print(sents)
+         entities_doc = []
+         for sent in sents:
+             item = self.convert_tensor(sent)
+             with torch.no_grad():
+                 outputs = self.model(**item)
+             entity = None
+             if isinstance(outputs.tags[0], list):
+                 tags = list(itertools.chain(*outputs.tags))
+             else:
+                 tags = outputs.tags
+             for w, l in list(zip(sent, tags)):
+                 w = w.replace("_", " ")
+                 tag = self.id2label[l]
+                 if not tag == 'O':
+                     parts = tag.split('-', 1)
+                     prefix = parts[0]
+                     tag = parts[1] if len(parts) > 1 else ""
+                     if entity is None:
+                         entity = (w, tag)
+                     else:
+                         if entity[-1] == tag:
+                             if prefix == 'I':
+                                 entity = (entity[0] + f' {w}', tag)
+                             else:
+                                 entities_doc.append(entity)
+                                 entity = (w, tag)
+                         else:
+                             entities_doc.append(entity)
+                             entity = (w, tag)
+                 elif entity is not None:
+                     entities_doc.append(entity)
+                     if w != ' ':
+                         entities_doc.append((w, 'O'))
+                     entity = None
+                 elif w != ' ':
+                     entities_doc.append((w, 'O'))
+                     entity = None
+         return entities_doc
+ 
+ 
+     def __call__(self, in_raw: str):
+         sents = self.preprocess(in_raw)
+         entities = []
+         for sent in sents:
+             item = self.convert_tensor(sent)
+             with torch.no_grad():
+                 outputs = self.model(**item)
+             entity = None
+             if isinstance(outputs.tags[0], list):
+                 tags = list(itertools.chain(*outputs.tags))
+             else:
+                 tags = outputs.tags
+             for w, l in list(zip(sent, tags)):
+                 w = w.replace("_", " ")
+                 tag = self.id2label[l]
+                 if not tag == 'O':
+                     prefix, tag = tag.split('-', 1)
+                     if entity is None:
+                         entity = (w, tag)
+                     else:
+                         if entity[-1] == tag:
+                             if prefix == 'I':
+                                 entity = (entity[0] + f' {w}', tag)
+                             else:
+                                 entities.append(entity)
+                                 entity = (w, tag)
+                         else:
+                             entities.append(entity)
+                             entity = (w, tag)
+                 elif entity is not None:
+                     entities.append(entity)
+                     entity = None
+                 else:
+                     entity = None
+         return entities
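For reference (not part of the commit): __call__ and extract_entity_doc group word-level B-/I- tags into entity spans. Below is a simplified, self-contained re-implementation of that grouping rule on hand-written tags, useful for checking the expected output format; the words and tags are illustrative, not model output.

words = ["Nguyễn_Văn_A", "sống", "tại", "Hà_Nội"]   # illustrative input
tags = ["B-PERSON", "O", "O", "B-LOCATION"]          # illustrative tags

entities, entity = [], None
for w, tag in zip(words, tags):
    w = w.replace("_", " ")
    if tag != "O":
        prefix, label = tag.split("-", 1)
        if entity and entity[1] == label and prefix == "I":
            entity = (entity[0] + f" {w}", label)    # extend the current span
        else:
            if entity:
                entities.append(entity)
            entity = (w, label)                      # start a new span
    else:
        if entity:
            entities.append(entity)
        entity = None
if entity:
    entities.append(entity)

print(entities)  # [('Nguyễn Văn A', 'PERSON'), ('Hà Nội', 'LOCATION')]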
Model/NER/VLSP2021/best_model.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9ba2ccb63d96cedbc6149174536a295da540b04faefce5d48d6c0b9e248a199d
+ size 538007497