Upload 8 files
Browse files
- config.json +37 -0
- csc_model.py +410 -0
- csc_tokenizer.py +126 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +7 -0
- tokenizer.json +0 -0
- tokenizer_config.json +21 -0
- vocab.txt +0 -0
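
The two config files register the custom classes through their auto_map entries, so the upload should load via the Auto classes once trust_remote_code is enabled. A minimal loading sketch; the repo id and the example sentence are placeholders, not part of this upload:

    from transformers import AutoModel, AutoTokenizer

    repo_id = "your-name/realise-csc"  # placeholder: substitute the actual Hub repo id
    tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
    model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)

    # ReaLiseForCSC.predict() raises unless a tokenizer is attached first.
    model.set_tokenizer(tokenizer)
    print(model.predict("今天天汽真好"))  # a str input returns a single corrected str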
config.json
ADDED
@@ -0,0 +1,37 @@
{
  "architectures": [
    "ReaLiseForCSC"
  ],
  "attention_probs_dropout_prob": 0.1,
  "auto_map": {
    "AutoModel": "csc_model.ReaLiseForCSC"
  },
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "image_model_type": 0,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_fonts": 3,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.33.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128,
  "vocab_size_or_config_json_file": 21128
}
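
Everything here is a stock BERT-base config except num_fonts (the number of glyph channels fed to the visual branch) and the auto_map entry pointing AutoModel at the custom class. A quick sanity check, reusing the placeholder repo id from above:

    from transformers import AutoConfig

    cfg = AutoConfig.from_pretrained("your-name/realise-csc", trust_remote_code=True)  # placeholder id
    print(cfg.model_type)  # bert
    print(cfg.num_fonts)   # 3, so forward() takes the multi-font glyph path
    print(cfg.auto_map)    # {'AutoModel': 'csc_model.ReaLiseForCSC'}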
csc_model.py
ADDED
@@ -0,0 +1,410 @@
import os
from copy import deepcopy

import numpy as np
import opencc
import pypinyin
import torch
from PIL import ImageFont
from torch import nn
from torch.nn import CrossEntropyLoss
from transformers.modeling_outputs import MaskedLMOutput

from transformers import BertPreTrainedModel, BertModel


def _is_chinese_char(cp):
    # True if the code point lies in any CJK ideograph block.
    if ((cp >= 0x4E00 and cp <= 0x9FFF) or    # CJK Unified Ideographs
            (cp >= 0x3400 and cp <= 0x4DBF) or    # Extension A
            (cp >= 0x20000 and cp <= 0x2A6DF) or  # Extension B
            (cp >= 0x2A700 and cp <= 0x2B73F) or  # Extension C
            (cp >= 0x2B740 and cp <= 0x2B81F) or  # Extension D
            (cp >= 0x2B820 and cp <= 0x2CEAF) or  # Extension E
            (cp >= 0xF900 and cp <= 0xFAFF) or    # Compatibility Ideographs
            (cp >= 0x2F800 and cp <= 0x2FA1F)):   # Compatibility Supplement
        return True
    return False


class Pinyin2(object):
    # Maps characters to phoneme-level pinyin id sequences.
    # Vocab: 'P' (padding), tone digits 1-5, letters a-z, 'U' (unknown).

    def __init__(self):
        super(Pinyin2, self).__init__()
        pho_vocab = ['P']
        pho_vocab += [chr(x) for x in range(ord('1'), ord('5') + 1)]
        pho_vocab += [chr(x) for x in range(ord('a'), ord('z') + 1)]
        pho_vocab += ['U']
        assert len(pho_vocab) == 33
        self.pho_vocab_size = len(pho_vocab)
        self.pho_vocab = {c: idx for idx, c in enumerate(pho_vocab)}

    def get_pho_size(self):
        return self.pho_vocab_size

    @staticmethod
    def get_pinyin(c):
        if len(c) > 1:
            return 'U'
        s = pypinyin.pinyin(
            c,
            style=pypinyin.Style.TONE3,
            neutral_tone_with_five=True,
            errors=lambda x: ['U' for _ in x],
        )[0][0]
        if s == 'U':
            return s
        assert isinstance(s, str)
        assert s[-1] in '12345'
        # Move the tone digit to the front, e.g. 'ma1' -> '1ma'.
        s = s[-1] + s[:-1]
        return s

    def convert(self, chars):
        pinyins = list(map(self.get_pinyin, chars))
        pinyin_ids = [list(map(self.pho_vocab.get, pinyin)) for pinyin in pinyins]
        pinyin_lens = [len(pinyin) for pinyin in pinyins]
        pinyin_ids = torch.nn.utils.rnn.pad_sequence(
            [torch.tensor(x) for x in pinyin_ids],
            batch_first=True,
            padding_value=0,
        )
        return pinyin_ids, pinyin_lens


pho2_convertor = Pinyin2()


class CharResNet(torch.nn.Module):

    def __init__(self, in_channels=1):
        super().__init__()
        # input_image: bxcx32x32, output_image: bx768x1x1
        self.res_block1 = BasicBlock(in_channels, 64, stride=2)  # channels: 64, size: 16x16
        self.res_block2 = BasicBlock(64, 128, stride=2)  # channels: 128, size: 8x8
        self.res_block3 = BasicBlock(128, 256, stride=2)  # channels: 256, size: 4x4
        self.res_block4 = BasicBlock(256, 512, stride=2)  # channels: 512, size: 2x2
        self.res_block5 = BasicBlock(512, 768, stride=2)  # channels: 768, size: 1x1

    def forward(self, x):
        # input_shape: bxcx32x32, output_shape: bx768
        # x = x.unsqueeze(1)
        h = self.res_block1(x)
        h = self.res_block2(h)
        h = self.res_block3(h)
        h = self.res_block4(h)
        h = self.res_block5(h)
        h = h.squeeze(-1).squeeze(-1)
        return h


class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()

        self.residual_function = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels * BasicBlock.expansion, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(out_channels * BasicBlock.expansion)
        )

        # Identity shortcut, or a 1x1 projection when the shape changes.
        self.shortcut = nn.Sequential()

        if stride != 1 or in_channels != BasicBlock.expansion * out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels * BasicBlock.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels * BasicBlock.expansion)
            )

    def forward(self, x):
        return nn.functional.relu(self.residual_function(x) + self.shortcut(x))


class CharResNet1(torch.nn.Module):

    def __init__(self, in_channels=1):
        super().__init__()
        self.res_block1 = BasicBlock(in_channels, 64, stride=2)  # channels: 64, size: 16x16
        self.res_block2 = BasicBlock(64, 128, stride=2)  # channels: 128, size: 8x8
        self.res_block3 = BasicBlock(128, 192, stride=2)  # channels: 192, size: 4x4
        self.res_block4 = BasicBlock(192, 192, stride=2)  # channels: 192, size: 2x2

    def forward(self, x):
        # input_shape: bxcx32x32, output_shape: bx768 (192 x 2 x 2 flattened)
        h = x
        h = self.res_block1(h)
        h = self.res_block2(h)
        h = self.res_block3(h)
        h = self.res_block4(h)
        h = h.view(h.shape[0], -1)
        return h


class ReaLiseForCSC(BertPreTrainedModel):

    def __init__(self, config):
        super(ReaLiseForCSC, self).__init__(config)
        self.config = config

        # Semantic stream: the main BERT encoder.
        self.vocab_size = config.vocab_size
        self.bert = BertModel(config)

        # Phonetic stream: GRU over per-token pinyin ids, then a 4-layer BERT.
        self.pho_embeddings = nn.Embedding(pho2_convertor.get_pho_size(), config.hidden_size, padding_idx=0)
        self.pho_gru = nn.GRU(
            input_size=config.hidden_size,
            hidden_size=config.hidden_size,
            num_layers=1,
            batch_first=True,
            dropout=0,
            bidirectional=False,
        )
        pho_config = deepcopy(config)
        pho_config.num_hidden_layers = 4
        self.pho_model = BertModel(pho_config)

        # Visual stream: a frozen per-character glyph bank (3 fonts, 32x32) fed to a ResNet.
        self.char_images_multifonts = torch.nn.Parameter(torch.rand(21128, 3, 32, 32))
        self.char_images_multifonts.requires_grad = False

        self.resnet = CharResNet(in_channels=3)
        self.resnet_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        # Selective gate over the three streams.
        self.gate_net = nn.Linear(4 * config.hidden_size, 3)

        out_config = deepcopy(config)
        out_config.num_hidden_layers = 3
        self.output_block = BertModel(out_config)

        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.vocab_size)

        self.init_weights()

        self.loss_fnt = CrossEntropyLoss(ignore_index=0)

        self.tokenizer = None

    def tie_cls_weight(self):
        self.classifier.weight = self.bert.embeddings.word_embeddings.weight

    def build_glyce_embed(self, vocab_dir, font_path, font_size=32):
        # Single-font variant; writes into self.char_images, which must be created
        # separately when config.num_fonts == 1.
        vocab_path = os.path.join(vocab_dir, 'vocab.txt')
        with open(vocab_path, 'r', encoding='utf-8') as f:
            vocab = [s.strip() for s in f]

        font = ImageFont.truetype(font_path, size=font_size)

        char_images = []
        for char in vocab:
            if len(char) != 1 or (not _is_chinese_char(ord(char))):
                char_images.append(np.zeros((font_size, font_size)).astype(np.float32))
                continue
            image = font.getmask(char)
            image = np.asarray(image).astype(np.float32).reshape(image.size[::-1])  # Must be [::-1]

            # Crop
            image = image[:font_size, :font_size]

            # Pad
            if image.size != (font_size, font_size):
                back_image = np.zeros((font_size, font_size)).astype(np.float32)
                offset0 = (font_size - image.shape[0]) // 2
                offset1 = (font_size - image.shape[1]) // 2
                back_image[offset0:offset0 + image.shape[0], offset1:offset1 + image.shape[1]] = image
                image = back_image

            char_images.append(image)
        char_images = np.array(char_images)
        char_images = (char_images - np.mean(char_images)) / np.std(char_images)
        char_images = torch.from_numpy(char_images).reshape(char_images.shape[0], -1)
        assert char_images.shape == (21128, 1024)
        self.char_images.weight.data.copy_(char_images)

    # Add by hengdaxu
    def build_glyce_embed_multifonts(self, vocab_dir, num_fonts, use_traditional_font, font_size=32):
        font_paths = [
            ('simhei.ttf', False),
            ('xiaozhuan.ttf', False),
            ('simhei.ttf', True),
        ]
        font_paths = font_paths[:num_fonts]
        if use_traditional_font:
            font_paths = font_paths[:-1]
            font_paths.append(('simhei.ttf', True))
            self.converter = opencc.OpenCC('s2t.json')

        images_list = []
        for font_path, use_traditional in font_paths:
            images = self.build_glyce_embed_onefont(
                vocab_dir=vocab_dir,
                font_path=font_path,
                font_size=font_size,
                use_traditional=use_traditional,
            )
            images_list.append(images)

        char_images = torch.stack(images_list, dim=1).contiguous()
        self.char_images_multifonts.data.copy_(char_images)

    # Add by hengdaxu
    def build_glyce_embed_onefont(self, vocab_dir, font_path, font_size, use_traditional):
        vocab_path = os.path.join(vocab_dir, 'vocab.txt')
        with open(vocab_path, encoding='utf-8') as f:
            vocab = [s.strip() for s in f.readlines()]
        if use_traditional:
            vocab = [self.converter.convert(c) if len(c) == 1 else c for c in vocab]

        font = ImageFont.truetype(font_path, size=font_size)

        char_images = []
        for char in vocab:
            if len(char) > 1:
                char_images.append(np.zeros((font_size, font_size)).astype(np.float32))
                continue
            image = font.getmask(char)
            image = np.asarray(image).astype(np.float32).reshape(image.size[::-1])  # Must be [::-1]

            # Crop
            image = image[:font_size, :font_size]

            # Pad
            if image.size != (font_size, font_size):
                back_image = np.zeros((font_size, font_size)).astype(np.float32)
                offset0 = (font_size - image.shape[0]) // 2
                offset1 = (font_size - image.shape[1]) // 2
                back_image[offset0:offset0 + image.shape[0], offset1:offset1 + image.shape[1]] = image
                image = back_image

            char_images.append(image)
        char_images = np.array(char_images)
        char_images = (char_images - np.mean(char_images)) / np.std(char_images)
        char_images = torch.from_numpy(char_images).contiguous()
        return char_images

    @staticmethod
    def build_batch(batch, tokenizer):
        src_idx = batch['src_idx'].flatten().tolist()
        chars = tokenizer.convert_ids_to_tokens(src_idx)
        pho_idx, pho_lens = pho2_convertor.convert(chars)
        batch['pho_idx'] = pho_idx
        batch['pho_lens'] = pho_lens
        return batch

    def forward(self,
                input_ids=None,
                pho_idx=None,
                pho_lens=None,
                attention_mask=None,
                labels=None,
                **kwargs):
        input_shape = input_ids.size()

        # Semantic stream: contextual hidden states from the main encoder.
        bert_hiddens = self.bert(input_ids, attention_mask=attention_mask)[0]

        # Phonetic stream: GRU over each token's pinyin sequence, then the 4-layer BERT.
        pho_embeddings = self.pho_embeddings(pho_idx)
        pho_embeddings = torch.nn.utils.rnn.pack_padded_sequence(
            input=pho_embeddings,
            lengths=pho_lens,
            batch_first=True,
            enforce_sorted=False,
        )
        _, pho_hiddens = self.pho_gru(pho_embeddings)
        pho_hiddens = pho_hiddens.squeeze(0).reshape(input_shape[0], input_shape[1], -1).contiguous()
        pho_hiddens = self.pho_model(inputs_embeds=pho_hiddens, attention_mask=attention_mask)[0]

        # Visual stream: look up each input token's glyph image(s) and encode with the ResNet.
        src_idxs = input_ids.view(-1)

        if self.config.num_fonts == 1:
            images = self.char_images(src_idxs).reshape(src_idxs.shape[0], 1, 32, 32).contiguous()
        else:
            images = self.char_images_multifonts.index_select(dim=0, index=src_idxs)

        res_hiddens = self.resnet(images)
        res_hiddens = res_hiddens.reshape(input_shape[0], input_shape[1], -1).contiguous()
        res_hiddens = self.resnet_layernorm(res_hiddens)

        # Mean-pooled sentence vector, broadcast over the sequence as gate context.
        bert_hiddens_mean = (bert_hiddens * attention_mask.to(torch.float).unsqueeze(2)).sum(dim=1) / attention_mask.to(
            torch.float).sum(dim=1, keepdim=True)
        bert_hiddens_mean = bert_hiddens_mean.unsqueeze(1).expand(-1, bert_hiddens.size(1), -1)

        concated_outputs = torch.cat((bert_hiddens, pho_hiddens, res_hiddens, bert_hiddens_mean), dim=-1)
        gated_values = self.gate_net(concated_outputs)  # B * S * 3
        g0 = torch.sigmoid(gated_values[:, :, 0].unsqueeze(-1))
        g1 = torch.sigmoid(gated_values[:, :, 1].unsqueeze(-1))
        g2 = torch.sigmoid(gated_values[:, :, 2].unsqueeze(-1))

        hiddens = g0 * bert_hiddens + g1 * pho_hiddens + g2 * res_hiddens

        outputs = self.output_block(inputs_embeds=hiddens,
                                    position_ids=torch.zeros(input_ids.size(), dtype=torch.long,
                                                             device=input_ids.device),
                                    attention_mask=attention_mask)

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        outputs = MaskedLMOutput(
            logits=logits,
            hidden_states=outputs.last_hidden_state,
        )

        if labels is not None:
            # Only keep active parts of the loss: remap [CLS] (101) and [SEP] (102)
            # to the ignore_index (0) so they do not contribute.
            labels[labels == 101] = 0
            labels[labels == 102] = 0
            loss = self.loss_fnt(logits.view(-1, logits.size(-1)), labels.view(-1))
            outputs.loss = loss

        return outputs

    def set_tokenizer(self, tokenizer):
        self.tokenizer = tokenizer

    def predict(self, sentences):
        if self.tokenizer is None:
            raise RuntimeError("Call `set_tokenizer(tokenizer)` before `predict`.")

        str_flag = False
        if isinstance(sentences, str):
            sentences = [sentences]
            str_flag = True

        inputs = self.tokenizer(sentences, padding=True, return_tensors="pt")
        outputs = self.forward(**inputs).logits

        ids_list = outputs.argmax(-1)

        preds = []
        for i, ids in enumerate(ids_list):
            ids = ids[inputs['attention_mask'][i].bool()]
            pred_tokens = self.tokenizer.convert_ids_to_tokens(ids)
            pred_tokens = [t if not t.startswith('##') else t[2:] for t in pred_tokens]
            pred_tokens = [t if t != self.tokenizer.unk_token else '×' for t in pred_tokens]

            offsets = inputs[i].offsets
            src_tokens = list(sentences[i])
            for (start, end), pred_token in zip(offsets, pred_tokens):
                # Skip special tokens (zero-width offsets) ...
                if end - start <= 0:
                    continue
                # ... predictions whose length differs from the source span ...
                if (end - start) != len(pred_token):
                    continue
                # ... [UNK] predictions ...
                if pred_token == '×':
                    continue
                # ... and single non-Chinese source characters.
                if (end - start) == 1 and not _is_chinese_char(ord(src_tokens[start])):
                    continue

                src_tokens[start:end] = pred_token

            pred = ''.join(src_tokens)
            preds.append(pred)

        if str_flag:
            return preds[0]

        return preds
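
One subtlety worth calling out: pypinyin's TONE3 style puts the tone digit at the end ('ma1'), and Pinyin2.get_pinyin moves it to the front so every phoneme sequence begins with its tone symbol. An illustrative check of the conversion:

    conv = Pinyin2()
    print(conv.get_pinyin('妈'))     # '1ma': TONE3 gives 'ma1'; the tone digit moves to the front
    print(conv.get_pinyin('[CLS]'))  # 'U': multi-character tokens map to the unknown symbol
    ids, lens = conv.convert(['妈', '的'])
    print(ids.shape, lens)           # torch.Size([2, 3]) and [3, 3]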
csc_tokenizer.py
ADDED
@@ -0,0 +1,126 @@
from typing import List, Optional, Union

import pypinyin
import torch

from transformers import BertTokenizerFast


class Pinyin2(object):
    # Duplicated from csc_model.py so this file stays self-contained for remote-code loading.

    def __init__(self):
        super(Pinyin2, self).__init__()
        pho_vocab = ['P']
        pho_vocab += [chr(x) for x in range(ord('1'), ord('5') + 1)]
        pho_vocab += [chr(x) for x in range(ord('a'), ord('z') + 1)]
        pho_vocab += ['U']
        assert len(pho_vocab) == 33
        self.pho_vocab_size = len(pho_vocab)
        self.pho_vocab = {c: idx for idx, c in enumerate(pho_vocab)}

    def get_pho_size(self):
        return self.pho_vocab_size

    @staticmethod
    def get_pinyin(c):
        if len(c) > 1:
            return 'U'
        s = pypinyin.pinyin(
            c,
            style=pypinyin.Style.TONE3,
            neutral_tone_with_five=True,
            errors=lambda x: ['U' for _ in x],
        )[0][0]
        if s == 'U':
            return s
        assert isinstance(s, str)
        assert s[-1] in '12345'
        # Move the tone digit to the front, e.g. 'ma1' -> '1ma'.
        s = s[-1] + s[:-1]
        return s

    def convert(self, chars):
        pinyins = list(map(self.get_pinyin, chars))
        pinyin_ids = [list(map(self.pho_vocab.get, pinyin)) for pinyin in pinyins]
        pinyin_lens = [len(pinyin) for pinyin in pinyins]
        pinyin_ids = torch.nn.utils.rnn.pad_sequence(
            [torch.tensor(x) for x in pinyin_ids],
            batch_first=True,
            padding_value=0,
        )
        return pinyin_ids, pinyin_lens


class ReaLiSeTokenizer(BertTokenizerFast):

    def __init__(self, **kwargs):
        super(ReaLiSeTokenizer, self).__init__(**kwargs)

        self.pho2_convertor = Pinyin2()

    def __call__(self,
                 text: Optional[Union[str, List[str], List[List[str]]]] = None,
                 text_pair: Optional[Union[str, List[str], List[List[str]]]] = None,
                 text_target: Optional[Union[str, List[str], List[List[str]]]] = None,
                 text_pair_target: Optional[Union[str, List[str], List[List[str]]]] = None,
                 add_special_tokens: bool = True,
                 padding=False,
                 truncation=None,
                 max_length: Optional[int] = None,
                 stride: int = 0,
                 is_split_into_words: bool = False,
                 pad_to_multiple_of: Optional[int] = None,
                 return_tensors=None,
                 return_token_type_ids: Optional[bool] = None,
                 return_attention_mask: Optional[bool] = None,
                 return_overflowing_tokens: bool = False,
                 return_special_tokens_mask: bool = False,
                 return_offsets_mapping: bool = False,
                 return_length: bool = False,
                 verbose: bool = True,
                 **kwargs):
        encoding = super(ReaLiSeTokenizer, self).__call__(
            text=text,
            text_pair=text_pair,
            text_target=text_target,
            text_pair_target=text_pair_target,
            add_special_tokens=add_special_tokens,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            stride=stride,
            is_split_into_words=is_split_into_words,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
        )

        input_ids = encoding['input_ids']
        if isinstance(text, str) and return_tensors is None:
            # A single un-tensorized sentence yields a flat id list; wrap it for the loop below.
            input_ids = [input_ids]

        pho_idx_list = []
        pho_lens_list = []
        for ids in input_ids:
            chars = self.convert_ids_to_tokens(ids)
            pho_idx, pho_lens = self.pho2_convertor.convert(chars)
            if return_tensors is None:
                pho_idx = pho_idx.tolist()
            pho_idx_list.append(pho_idx)
            pho_lens_list += pho_lens

        pho_idx = pho_idx_list
        pho_lens = pho_lens_list
        if return_tensors == 'pt':
            # Flatten to (batch * seq_len, pinyin_len), the layout ReaLiseForCSC.forward expects.
            pho_idx = torch.vstack(pho_idx)
            pho_lens = torch.LongTensor(pho_lens)

        if isinstance(text, str) and return_tensors is None:
            pho_idx = pho_idx[0]

        encoding['pho_idx'] = pho_idx
        encoding['pho_lens'] = pho_lens

        return encoding
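
Observable behaviour differs from a stock BertTokenizerFast only in the two extra fields appended to the encoding. A sketch, assuming the repo files sit in a local directory ./realise-csc (hypothetical path):

    tok = ReaLiSeTokenizer.from_pretrained("./realise-csc")  # hypothetical local checkout
    enc = tok(["天气不错"], return_tensors="pt")
    print(enc["input_ids"].shape)  # torch.Size([1, 6]): [CLS] + four characters + [SEP]
    print(enc["pho_idx"].shape)    # (6, max_pinyin_len): one row of pinyin ids per token
    print(enc["pho_lens"])         # LongTensor of the six per-token pinyin lengths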
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d2c1d619e580891dffbe99d430cc116a82389433ab2a7ed15f8b43088135cf8d
size 1140770411
special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
{
  "cls_token": "[CLS]",
  "mask_token": "[MASK]",
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "unk_token": "[UNK]"
}
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
tokenizer_config.json
ADDED
@@ -0,0 +1,21 @@
{
  "auto_map": {
    "AutoTokenizer": [
      "csc_tokenizer.ReaLiSeTokenizer",
      null
    ]
  },
  "clean_up_tokenization_spaces": true,
  "cls_token": "[CLS]",
  "do_basic_tokenize": true,
  "do_lower_case": false,
  "mask_token": "[MASK]",
  "model_max_length": 1000000000000000019884624838656,
  "never_split": null,
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "ReaLiSeTokenizer",
  "unk_token": "[UNK]"
}
vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff