Linhz committed
Commit c6538e0 · verified · 1 Parent(s): 548e229

Upload 4 files

Model/NER/VLSP2021/Load_model.py ADDED
@@ -0,0 +1,34 @@
+ from transformers import RobertaConfig, AutoConfig
+ from transformers import AutoTokenizer, AutoModelForTokenClassification
+ from Model.NER.VLSP2021.Ner_CRF import PhoBertCrf, PhoBertSoftmax, PhoBertLstmCrf
+ from Model.NER.VLSP2021.Predict_Ner import ViTagger
+ import torch
+ from spacy import displacy
+ import re
+ 
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+ 
+ MODEL_MAPPING = {
+     'vinai/phobert-base': {
+         'softmax': PhoBertSoftmax,
+         'crf': PhoBertCrf,
+         'lstm_crf': PhoBertLstmCrf
+     },
+ }
+ 
+ if device == 'cpu':
+     checkpoint_data = torch.load('E:/demo_datn/pythonProject1/Model/NER/VLSP2021/best_model.pt', map_location='cpu')
+ else:
+     checkpoint_data = torch.load('E:/demo_datn/pythonProject1/Model/NER/VLSP2021/best_model.pt')
+ 
+ configs = checkpoint_data['args']
+ print(configs.model_name_or_path)
+ tokenizer = AutoTokenizer.from_pretrained(configs.model_name_or_path)
+ model_clss = MODEL_MAPPING[configs.model_name_or_path][configs.model_arch]
+ config = AutoConfig.from_pretrained(configs.model_name_or_path,
+                                     num_labels=len(checkpoint_data['classes']),
+                                     finetuning_task=configs.task)
+ model = model_clss(config=config)
+ model.resize_token_embeddings(len(tokenizer))
+ model.to(device)
+ model.load_state_dict(checkpoint_data['model'], strict=False)
+ print(model)
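Note (not part of the committed files): Load_model.py loads the checkpoint eagerly at import time. A minimal usage sketch, assuming the hard-coded checkpoint and VnCoreNLP paths from this commit are valid on the target machine, would instead go through the ViTagger class added in Predict_Ner.py:

# Illustrative only; paths follow the hard-coded ones in this commit.
from Model.NER.VLSP2021.Predict_Ner import ViTagger

tagger = ViTagger('E:/demo_datn/pythonProject1/Model/NER/VLSP2021/best_model.pt', no_cuda=True)
print(tagger("Hà Nội là thủ đô của Việt Nam."))  # -> list of (text, tag) pairs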
Model/NER/VLSP2021/Ner_CRF.py ADDED
@@ -0,0 +1,144 @@
+ from typing import Optional, List, Tuple, Any
+ from collections import OrderedDict
+ from transformers import logging, RobertaForTokenClassification
+ from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
+ from torchcrf import CRF
+ 
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ 
+ logging.set_verbosity_error()
+ 
+ 
+ class NerOutput(OrderedDict):
+     loss: Optional[torch.FloatTensor] = torch.FloatTensor([0.0])
+     tags: Optional[List[int]] = []
+ 
+     def __getitem__(self, k):
+         if isinstance(k, str):
+             inner_dict = {k: v for (k, v) in self.items()}
+             return inner_dict[k]
+         else:
+             return self.to_tuple()[k]
+ 
+     def __setattr__(self, name, value):
+         if name in self.keys() and value is not None:
+             super().__setitem__(name, value)
+         super().__setattr__(name, value)
+ 
+     def __setitem__(self, key, value):
+         super().__setitem__(key, value)
+         super().__setattr__(key, value)
+ 
+     def to_tuple(self) -> Tuple[Any]:
+         return tuple(self[k] for k in self.keys())
+ 
+ 
+ class PhoBertSoftmax(RobertaForTokenClassification):
+     def __init__(self, config, **kwargs):
+         super(PhoBertSoftmax, self).__init__(config=config, **kwargs)
+         self.num_labels = config.num_labels
+ 
+     def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, valid_ids=None,
+                 label_masks=None):
+         seq_output = self.roberta(input_ids=input_ids,
+                                   token_type_ids=token_type_ids,
+                                   attention_mask=attention_mask,
+                                   head_mask=None)[0]
+         seq_output = self.dropout(seq_output)
+         logits = self.classifier(seq_output)
+         probs = F.log_softmax(logits, dim=2)
+         label_masks = label_masks.view(-1) != 0
+         seq_tags = torch.masked_select(torch.argmax(probs, dim=2).view(-1), label_masks).tolist()
+         if labels is not None:
+             loss_func = nn.CrossEntropyLoss()
+             loss = loss_func(logits.view(-1, self.num_labels), labels.view(-1))
+             return NerOutput(loss=loss, tags=seq_tags)
+         else:
+             return NerOutput(tags=seq_tags)
+ 
+ 
+ class PhoBertCrf(RobertaForTokenClassification):
+     def __init__(self, config):
+         super(PhoBertCrf, self).__init__(config=config)
+         self.num_labels = config.num_labels
+         self.crf = CRF(config.num_labels, batch_first=True)
+         self.init_weights()
+ 
+     def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, valid_ids=None,
+                 label_masks=None):
+         seq_outputs = self.roberta(input_ids=input_ids,
+                                    token_type_ids=token_type_ids,
+                                    attention_mask=attention_mask,
+                                    head_mask=None)[0]
+ 
+         batch_size, max_len, feat_dim = seq_outputs.shape
+         range_vector = torch.arange(0, batch_size, dtype=torch.long, device=seq_outputs.device).unsqueeze(1)
+         # Keep only the hidden state of the first sub-token of each word.
+         seq_outputs = seq_outputs[range_vector, valid_ids]
+         seq_outputs = self.dropout(seq_outputs)
+         logits = self.classifier(seq_outputs)
+         seq_tags = self.crf.decode(logits, mask=label_masks != 0)
+ 
+         if labels is not None:
+             log_likelihood = self.crf(logits, labels, mask=label_masks.type(torch.uint8))
+             return NerOutput(loss=-1.0 * log_likelihood, tags=seq_tags)
+         else:
+             return NerOutput(tags=seq_tags)
+ 
+ 
+ class PhoBertLstmCrf(RobertaForTokenClassification):
+     def __init__(self, config):
+         super(PhoBertLstmCrf, self).__init__(config=config)
+         self.num_labels = config.num_labels
+         self.lstm = nn.LSTM(input_size=config.hidden_size,
+                             hidden_size=config.hidden_size // 2,
+                             num_layers=1,
+                             batch_first=True,
+                             bidirectional=True)
+         self.crf = CRF(config.num_labels, batch_first=True)
+ 
+     @staticmethod
+     def sort_batch(src_tensor, lengths):
+         """
+         Sort a minibatch by sequence length, longest sequences first, and
+         return the sorted batch, the sorted lengths, and the indices needed
+         to restore the original order, so the output can be used by
+         pack_padded_sequence(...).
+         """
+         seq_lengths, perm_idx = lengths.sort(0, descending=True)
+         seq_tensor = src_tensor[perm_idx]
+         _, reversed_idx = perm_idx.sort(0, descending=False)
+         return seq_tensor, seq_lengths, reversed_idx
+ 
+     def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, valid_ids=None,
+                 label_masks=None):
+         seq_outputs = self.roberta(input_ids=input_ids,
+                                    token_type_ids=token_type_ids,
+                                    attention_mask=attention_mask,
+                                    head_mask=None)[0]
+ 
+         batch_size, max_len, feat_dim = seq_outputs.shape
+         seq_lens = torch.sum(label_masks, dim=-1)
+         range_vector = torch.arange(0, batch_size, dtype=torch.long, device=seq_outputs.device).unsqueeze(1)
+         seq_outputs = seq_outputs[range_vector, valid_ids]
+ 
+         sorted_seq_outputs, sorted_seq_lens, reversed_idx = self.sort_batch(src_tensor=seq_outputs,
+                                                                             lengths=seq_lens)
+         packed_words = pack_padded_sequence(sorted_seq_outputs, sorted_seq_lens.cpu(), batch_first=True)
+         lstm_outs, _ = self.lstm(packed_words)
+         lstm_outs, _ = pad_packed_sequence(lstm_outs, batch_first=True, total_length=max_len)
+         seq_outputs = lstm_outs[reversed_idx]
+ 
+         seq_outputs = self.dropout(seq_outputs)
+         logits = self.classifier(seq_outputs)
+         seq_tags = self.crf.decode(logits, mask=label_masks != 0)
+ 
+         if labels is not None:
+             log_likelihood = self.crf(logits, labels, mask=label_masks.type(torch.uint8))
+             return NerOutput(loss=-1.0 * log_likelihood, tags=seq_tags)
+         else:
+             return NerOutput(tags=seq_tags)
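For context (not part of the commit): all three heads rely on the torchcrf package. A minimal, self-contained sketch of the CRF calls used above, with illustrative shapes, looks like this:

import torch
from torchcrf import CRF

num_labels, batch, seq_len = 5, 2, 4
crf = CRF(num_labels, batch_first=True)

emissions = torch.randn(batch, seq_len, num_labels)       # classifier logits
labels = torch.randint(0, num_labels, (batch, seq_len))   # gold tag ids
mask = torch.ones(batch, seq_len, dtype=torch.uint8)      # 1 = real token

log_likelihood = crf(emissions, labels, mask=mask)   # scalar log-likelihood
loss = -1.0 * log_likelihood                         # as in PhoBertCrf.forward
best_paths = crf.decode(emissions, mask=mask)        # list of tag-id sequences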
Model/NER/VLSP2021/Predict_Ner.py ADDED
@@ -0,0 +1,210 @@
+ from vncorenlp import VnCoreNLP
+ 
+ from typing import Union
+ from transformers import AutoConfig, AutoTokenizer
+ from Model.NER.VLSP2021.Ner_CRF import PhoBertCrf, PhoBertSoftmax, PhoBertLstmCrf
+ import re
+ import os
+ import torch
+ import itertools
+ import numpy as np
+ 
+ MODEL_MAPPING = {
+     'vinai/phobert-base': {
+         'softmax': PhoBertSoftmax,
+         'crf': PhoBertCrf,
+         'lstm_crf': PhoBertLstmCrf
+     },
+ }
+ 
+ 
+ def normalize_text(txt: str) -> str:
+     # Remove invisible/special characters
+     txt = re.sub("\xad|\u200b|\ufeff", "", txt)
+     # Normalize Vietnamese tone-mark placement
+     txt = re.sub(r"òa", "oà", txt)
+     txt = re.sub(r"óa", "oá", txt)
+     txt = re.sub(r"ỏa", "oả", txt)
+     txt = re.sub(r"õa", "oã", txt)
+     txt = re.sub(r"ọa", "oạ", txt)
+     txt = re.sub(r"òe", "oè", txt)
+     txt = re.sub(r"óe", "oé", txt)
+     txt = re.sub(r"ỏe", "oẻ", txt)
+     txt = re.sub(r"õe", "oẽ", txt)
+     txt = re.sub(r"ọe", "oẹ", txt)
+     txt = re.sub(r"ùy", "uỳ", txt)
+     txt = re.sub(r"úy", "uý", txt)
+     txt = re.sub(r"ủy", "uỷ", txt)
+     txt = re.sub(r"ũy", "uỹ", txt)
+     txt = re.sub(r"ụy", "uỵ", txt)
+     txt = re.sub(r"Ủy", "Uỷ", txt)
+ 
+     txt = re.sub(r'"', '”', txt)
+ 
+     # Collapse repeated spaces
+     txt = re.sub(" +", " ", txt)
+     return txt.strip()
+ 
+ 
+ class ViTagger(object):
+     def __init__(self, model_path: Union[str, os.PathLike], no_cuda=False):
+         self.device = 'cuda' if not no_cuda and torch.cuda.is_available() else 'cpu'
+         print("[ViTagger] VnCoreNLP loading ...")
+         self.rdrsegmenter = VnCoreNLP("E:/demo_datn/pythonProject1/VnCoreNLP/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m')
+         print("[ViTagger] Model loading ...")
+         self.model, self.tokenizer, self.max_seq_len, self.label2id, self.use_crf = self.load_model(model_path, device=self.device)
+         self.id2label = {idx: label for idx, label in enumerate(self.label2id)}
+         print("[ViTagger] All ready!")
+ 
+     @staticmethod
+     def load_model(model_path: Union[str, os.PathLike], device='cpu'):
+         if device == 'cpu':
+             checkpoint_data = torch.load(model_path, map_location='cpu')
+         else:
+             checkpoint_data = torch.load(model_path)
+         args = checkpoint_data["args"]
+         max_seq_len = args.max_seq_length
+         use_crf = 'crf' in args.model_arch
+         tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=False)
+         config = AutoConfig.from_pretrained(args.model_name_or_path, num_labels=len(args.label2id))
+         model_clss = MODEL_MAPPING[args.model_name_or_path][args.model_arch]
+         model = model_clss(config=config)
+         model.load_state_dict(checkpoint_data['model'], strict=False)
+         model.to(device)
+         model.eval()
+ 
+         return model, tokenizer, max_seq_len, args.label2id, use_crf
+ 
+     def preprocess(self, in_raw: str):
+         norm_text = normalize_text(in_raw)
+         sents = []
+         sentences = self.rdrsegmenter.tokenize(norm_text)
+         for sentence in sentences:
+             sents.append(sentence)
+         return sents
+ 
+     def convert_tensor(self, tokens):
+         seq_len = len(tokens)
+         encoding = self.tokenizer(tokens,
+                                   padding='max_length',
+                                   truncation=True,
+                                   is_split_into_words=True,
+                                   max_length=self.max_seq_len)
+         if 'vinai/phobert' in self.tokenizer.name_or_path:
+             print(' '.join(tokens))
+             subwords = self.tokenizer.tokenize(' '.join(tokens))
+             valid_ids = np.zeros(len(encoding.input_ids), dtype=int)
+             label_marks = np.zeros(len(encoding.input_ids), dtype=int)
+             i = 1
+             for idx, subword in enumerate(subwords[:self.max_seq_len - 2]):
+                 if idx != 0 and subwords[idx - 1].endswith("@@"):
+                     continue
+                 if self.use_crf:
+                     valid_ids[i - 1] = idx + 1
+                 else:
+                     valid_ids[idx + 1] = 1
+                 i += 1
+         else:
+             valid_ids = np.zeros(len(encoding.input_ids), dtype=int)
+             label_marks = np.zeros(len(encoding.input_ids), dtype=int)
+             i = 1
+             word_ids = encoding.word_ids()
+             for idx in range(1, len(word_ids)):
+                 if word_ids[idx] is not None and word_ids[idx] != word_ids[idx - 1]:
+                     if self.use_crf:
+                         valid_ids[i - 1] = idx
+                     else:
+                         valid_ids[idx] = 1
+                     i += 1
+         if self.max_seq_len >= seq_len + 2:
+             label_marks[:seq_len] = [1] * seq_len
+         else:
+             label_marks[:-2] = [1] * (self.max_seq_len - 2)
+         if self.use_crf and label_marks[0] == 0:
+             raise ValueError(f"{tokens} have mark == 0 at index 0!")
+         item = {key: torch.as_tensor([val]).to(self.device, dtype=torch.long) for key, val in encoding.items()}
+         item['valid_ids'] = torch.as_tensor([valid_ids]).to(self.device, dtype=torch.long)
+         item['label_masks'] = torch.as_tensor([valid_ids]).to(self.device, dtype=torch.long)
+         return item
+ 
+     def extract_entity_doc(self, in_raw: str):
+         sents = self.preprocess(in_raw)
+         print(sents)
+         entities_doc = []
+         for sent in sents:
+             item = self.convert_tensor(sent)
+             with torch.no_grad():
+                 outputs = self.model(**item)
+             entity = None
+             if isinstance(outputs.tags[0], list):
+                 tags = list(itertools.chain(*outputs.tags))
+             else:
+                 tags = outputs.tags
+             for w, l in list(zip(sent, tags)):
+                 w = w.replace("_", " ")
+                 tag = self.id2label[l]
+                 if not tag == 'O':
+                     parts = tag.split('-', 1)
+                     prefix = parts[0]
+                     tag = parts[1] if len(parts) > 1 else ""
+                     if entity is None:
+                         entity = (w, tag)
+                     else:
+                         if entity[-1] == tag:
+                             if prefix == 'I':
+                                 entity = (entity[0] + f' {w}', tag)
+                             else:
+                                 entities_doc.append(entity)
+                                 entity = (w, tag)
+                         else:
+                             entities_doc.append(entity)
+                             entity = (w, tag)
+                 elif entity is not None:
+                     entities_doc.append(entity)
+                     if w != ' ':
+                         entities_doc.append((w, 'O'))
+                     entity = None
+                 elif w != ' ':
+                     entities_doc.append((w, 'O'))
+                     entity = None
+         return entities_doc
+ 
+ 
+     def __call__(self, in_raw: str):
+         sents = self.preprocess(in_raw)
+         entities = []
+         for sent in sents:
+             item = self.convert_tensor(sent)
+             with torch.no_grad():
+                 outputs = self.model(**item)
+             entity = None
+             if isinstance(outputs.tags[0], list):
+                 tags = list(itertools.chain(*outputs.tags))
+             else:
+                 tags = outputs.tags
+             for w, l in list(zip(sent, tags)):
+                 w = w.replace("_", " ")
+                 tag = self.id2label[l]
+                 if not tag == 'O':
+                     prefix, tag = tag.split('-', 1)
+                     if entity is None:
+                         entity = (w, tag)
+                     else:
+                         if entity[-1] == tag:
+                             if prefix == 'I':
+                                 entity = (entity[0] + f' {w}', tag)
+                             else:
+                                 entities.append(entity)
+                                 entity = (w, tag)
+                         else:
+                             entities.append(entity)
+                             entity = (w, tag)
+                 elif entity is not None:
+                     entities.append(entity)
+                     entity = None
+                 else:
+                     entity = None
+         return entities
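For reference (not part of the commit): __call__ and extract_entity_doc group word-level B-/I- tags into entity spans. Below is a simplified, self-contained re-implementation of that grouping rule on hand-written tags, useful for checking the expected output format; the words and tags are illustrative, not model output.

words = ["Nguyễn_Văn_A", "sống", "tại", "Hà_Nội"]   # illustrative input
tags = ["B-PERSON", "O", "O", "B-LOCATION"]          # illustrative tags

entities, entity = [], None
for w, tag in zip(words, tags):
    w = w.replace("_", " ")
    if tag != "O":
        prefix, label = tag.split("-", 1)
        if entity and entity[1] == label and prefix == "I":
            entity = (entity[0] + f" {w}", label)    # extend the current span
        else:
            if entity:
                entities.append(entity)
            entity = (w, label)                      # start a new span
    else:
        if entity:
            entities.append(entity)
        entity = None
if entity:
    entities.append(entity)

print(entities)  # [('Nguyễn Văn A', 'PERSON'), ('Hà Nội', 'LOCATION')]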
Model/NER/VLSP2021/best_model.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9ba2ccb63d96cedbc6149174536a295da540b04faefce5d48d6c0b9e248a199d
+ size 538007497