|
import os |
|
import json |
|
import unicodedata |
|
from typing import Any, Dict, List, Optional, Tuple, Union |
|
from collections.abc import Mapping |
|
from collections import Counter |
|
import itertools |
|
import torch |
|
|
|
from transformers.tokenization_utils import PreTrainedTokenizer, PaddingStrategy, TruncationStrategy, TensorType, BatchEncoding |
|
from transformers.utils import logging, is_torch_tensor |
|
|
|
TextInput = str |
|
PreTokenizedInput = List[str] |
|
EncodedInput = List[List[int]] |
|
TextInputPair = Tuple[TextInput, TextInput] |
|
PreTokenizedInputPair = Tuple[PreTokenizedInput, PreTokenizedInput] |
|
EncodedInputPair = Tuple[EncodedInput, EncodedInput] |
|
|
|
logger = logging.get_logger(__name__) |
|
|
|
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json"} |
|
|
|
|
|
|
|
class HLMTokenizer(PreTrainedTokenizer): |
|
r""" |
|
    Constructs an HLM tokenizer. The tokenizer is character-based: text is split on whitespace into words, and
    each word is represented as a sequence of character ids drawn from a JSON vocabulary (see [`HLMTokenizer.train`]).
|
|
|
Args: |
|
vocab_file (`str`): |
|
Path to .json vocab file. |
|
        bos_token (`str`, *optional*, defaults to `"[CLS]"`):
            The beginning of sequence token that was used during pre-training. Can be used as a sequence
            classifier token. When building a sequence using special tokens, this is not the token that is used
            for the beginning of sequence; the token used is the `cls_token`.
        eos_token (`str`, *optional*, defaults to `"[SEP]"`):
            The end of sequence token. When building a sequence using special tokens, this is not the token that
            is used for the end of sequence; the token used is the `sep_token`.
|
unk_token (`str`, *optional*, defaults to `"[UNK]"`): |
|
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this |
|
token instead. |
|
sep_token (`str`, *optional*, defaults to `"[SEP]"`): |
|
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for |
|
sequence classification or for a text and a question for question answering. It is also used as the last |
|
token of a sequence built with special tokens. |
|
pad_token (`str`, *optional*, defaults to `"[PAD]"`): |
|
The token used for padding, for example when batching sequences of different lengths. |
|
cls_token (`str`, *optional*, defaults to `"[CLS]"`): |
|
The classifier token which is used when doing sequence classification (classification of the whole sequence |
|
instead of per-token classification). It is the first token of the sequence when built with special tokens. |
|
mask_token (`str`, *optional*, defaults to `"[MASK]"`): |
|
The token used for masking values. This is the token used when training this model with masked language |
|
modeling. This is the token which the model will try to predict. |
|
word_cls_token (`str`, *optional*, defaults to `"[WORD_CLS]"`): |
|
The classifier token which is used for word representations and word classification. |
|
It is the first token of each word when built with special tokens. |
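    Example (illustrative; the path and `max_word_length` value below are assumptions, not shipped defaults):

        tokenizer = HLMTokenizer("path/to/vocab.json", max_word_length=16)
        enc = tokenizer("a test sentence", return_tensors="pt")
        # enc["input_ids"] has shape (1, num_words + 2, max_word_length): one row of character ids
        # per word, plus one single-character "word" each for [CLS] and [SEP].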
|
""" |
|
|
|
vocab_files_names = VOCAB_FILES_NAMES |
|
model_input_names: List[str] = ["input_ids", "char_input_mask", "word_input_mask", "word_type_ids"] |
|
padding_side: str = "right" |
|
truncation_side: str = "right" |
|
|
|
def __init__( |
|
self, |
|
vocab_file, |
|
split_by_punct=False, |
|
bos_token="[CLS]", |
|
eos_token="[SEP]", |
|
unk_token="[UNK]", |
|
sep_token="[SEP]", |
|
pad_token="[PAD]", |
|
cls_token="[CLS]", |
|
mask_token="[MASK]", |
|
word_cls_token="[WORD_CLS]", |
|
max_word_length=None, |
|
model_max_length=None, |
|
**kwargs, |
|
) -> None: |
|
if not os.path.isfile(vocab_file): |
|
raise ValueError( |
|
f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a pretrained" |
|
" model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" |
|
) |
|
|
|
        if max_word_length is not None:
            self.max_word_length = max_word_length
        else:
            config_path = os.path.join(os.path.dirname(vocab_file), "config.json")
            try:
                with open(config_path, "r", encoding="utf-8") as f:
                    config = json.load(f)
                self.max_word_length = config["max_word_length"]
                if model_max_length is None:
                    model_max_length = config.get("max_seq_length", None)
            except (OSError, KeyError, json.JSONDecodeError) as e:
                raise ValueError(
                    "Failed to load `max_word_length` from the config.json next to the vocab file. "
                    "Please specify `max_word_length` explicitly."
                ) from e
|
|
|
self.split_by_punct = split_by_punct |
|
self.vocab_file = vocab_file |
|
with open(vocab_file, 'r', encoding='utf-8') as f: |
|
vocab_data = json.load(f) |
|
self.vocab = vocab_data["vocab"] |
|
self.inv_vocab = {v: k for k, v in self.vocab.items()} |
|
|
|
super().__init__( |
|
bos_token=bos_token, |
|
eos_token=eos_token, |
|
unk_token=unk_token, |
|
sep_token=sep_token, |
|
pad_token=pad_token, |
|
cls_token=cls_token, |
|
mask_token=mask_token, |
|
split_by_punct=split_by_punct, |
|
model_max_length=model_max_length, |
|
**kwargs, |
|
) |
|
self.unk_id = self.vocab["[UNK]"] |
|
self.word_cls_token = word_cls_token |
|
self.word_cls_token_id = self._convert_token_to_id(word_cls_token) |
|
self.label_pad_token_id = -100 |
|
self.special_ids = [self._convert_token_to_id(token) for token in vocab_data["special_tokens"]] |
|
|
|
|
|
|
|
        # Templates used when padding a batch to its longest example: one all-zero pad "word"
        # and the matching all-zero character mask, both of length `max_word_length`.
        self.pad_word = [[0] * self.max_word_length]
        self.pad_mask_word = [[0] * self.max_word_length]
|
|
|
@staticmethod |
|
def train(files: List[Union[str, os.PathLike]], output_dir: Union[str, os.PathLike], vocab_size: int=512, max_lines_to_consider=2_000_000): |
|
char_maps = [] |
|
|
|
|
|
for file in files: |
|
print('Loading char counts from', file) |
|
counter = Counter() |
|
line_count = 0 |
|
            with open(file, "r", encoding="utf-8") as fh:
                while line_count < max_lines_to_consider:
                    lines = fh.readlines(100 * 1024)
|
if len(lines) == 0: |
|
break |
|
for line in lines: |
|
line = unicodedata.normalize('NFKC', line) |
|
line_count += 1 |
|
counter.update(line) |
|
d = {} |
|
            total = sum(counter.values())
|
for char, count in counter.items(): |
|
d[char] = count / total |
|
char_maps.append(d) |
|
|
|
char_map = {} |
|
for d in char_maps: |
|
for char, freq in d.items(): |
|
if not char.isspace(): |
|
char_map[char] = char_map.get(char, 0) + freq |
|
|
|
special_tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', '[WORD_CLS]'] |
|
chars_to_keep = sorted(list(char_map.keys()), key=lambda c: char_map[c], reverse=True)[:vocab_size-len(special_tokens)] |
|
vocab_entries = [*special_tokens, *chars_to_keep] |
|
|
|
vocab = { |
|
'special_tokens': special_tokens, |
|
'vocab': { key: i for i, key in enumerate(vocab_entries) } |
|
} |
|
|
|
        assert len(vocab_entries) == vocab_size, f"built {len(vocab_entries)} vocab entries for vocab_size={vocab_size}; the corpus has too few distinct characters"
|
|
|
filename = os.path.join(output_dir, VOCAB_FILES_NAMES["vocab_file"]) |
|
os.makedirs(output_dir, exist_ok=True) |
|
print("Saving vocab to", filename) |
|
with open(filename, 'w', encoding='utf-8') as f: |
|
json.dump(vocab, f, ensure_ascii=False, indent=4) |
|
|
|
return filename |
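    # Illustrative usage (file and directory names are hypothetical):
    #
    #     vocab_path = HLMTokenizer.train(["corpus_a.txt", "corpus_b.txt"], "./hlm_vocab", vocab_size=512)
    #     tokenizer = HLMTokenizer(vocab_path, max_word_length=16)
    #
    # The saved vocab.json contains the six special tokens above plus the most frequent non-whitespace
    # characters, ranked by their per-file relative frequencies summed over files (so each input file is
    # weighted equally regardless of size).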
|
|
|
@property |
|
def vocab_size(self): |
|
return len(self.vocab) |
|
|
|
def get_vocab(self): |
|
return self.vocab |
|
|
|
def _convert_token_to_id(self, token): |
|
"""Converts a token (str) to an id using the vocab.""" |
|
return self.vocab.get(token, self.unk_id) |
|
|
|
def _convert_id_to_token(self, index): |
|
"""Converts an index (integer) in a token (str) using the vocab.""" |
|
return self.inv_vocab[index] if index < self.vocab_size else self.unk_token |
|
|
|
def convert_tokens_to_ids(self, tokens: Union[str, List[str], List[List[str]]]): |
|
if isinstance(tokens, str): |
|
return self._convert_token_to_id(tokens) |
|
if len(tokens) > 0 and isinstance(tokens[0], str): |
|
return [self._convert_token_to_id(token) for token in tokens] |
|
return [[self._convert_token_to_id(token) for token in word] for word in tokens] |
|
|
|
def convert_tokens_to_string(self, tokens): |
|
"""Converts a sequence of tokens (string) in a single string.""" |
|
raise NotImplementedError |
|
|
|
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): |
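        """
        Wraps word-level id lists with special tokens, each special token being its own single-id "word":
        `[CLS] A [SEP]` for a single sequence and `[CLS] A [SEP] [CLS] B [SEP]` for a pair.
        """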
|
if token_ids_1 is None: |
|
return [[self.cls_token_id]] + token_ids_0 + [[self.eos_token_id]] |
|
return [[self.cls_token_id]] + token_ids_0 + [[self.eos_token_id], [self.cls_token_id]] + token_ids_1 + [[self.eos_token_id]] |
|
|
|
def num_special_tokens_to_add(self, pair: bool = False) -> int: |
|
        return 4 if pair else 2  # [CLS] A [SEP] for a single sequence, [CLS] A [SEP] [CLS] B [SEP] for a pair
|
|
|
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): |
|
raise NotImplementedError |
|
|
|
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None, has_special_tokens=False): |
|
if has_special_tokens: |
|
return [0] * (len(token_ids_0)+2) + ([1] * (len(token_ids_1)+2) if token_ids_1 is not None else []) |
|
else: |
|
return [0] * len(token_ids_0) + ([1] * len(token_ids_1) if token_ids_1 is not None else []) |
|
|
|
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: |
|
filename = VOCAB_FILES_NAMES["vocab_file"] |
|
if filename_prefix is not None: |
|
filename = filename_prefix + "-" + filename |
|
full_path = os.path.join(save_directory, filename) |
|
with open(full_path, "w", encoding="utf-8") as f: |
|
json.dump({ |
|
"special_tokens": self.all_special_tokens, |
|
"vocab": self.get_vocab(), |
|
}, f, ensure_ascii=False, indent=4) |
|
return (full_path,) |
|
|
|
def encode( |
|
self, |
|
text: Union[TextInput, PreTokenizedInput, EncodedInput], |
|
text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, |
|
is_split_into_words: bool = False, |
|
add_special_tokens: bool = False, |
|
padding: Union[bool, str, PaddingStrategy] = False, |
|
truncation: Union[bool, str, TruncationStrategy] = None, |
|
max_length: Optional[int] = None, |
|
return_tensors: Optional[Union[str, TensorType]] = None, |
|
**kwargs, |
|
    ) -> List[List[int]]:
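        """
        Converts a string (or pre-tokenized/pre-encoded input) into a list of words, each a list of character
        ids. `padding`, `truncation`, `max_length` and `return_tensors` are accepted for signature compatibility
        but are not applied here; use `__call__` or `encode_plus` for padded, truncated or tensorized output.
        """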
|
def get_input_ids(text): |
|
if isinstance(text, str): |
|
tokens = self.tokenize(text, **kwargs) |
|
return self.convert_tokens_to_ids(tokens) |
|
elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str): |
|
if is_split_into_words: |
|
tokens = list( |
|
itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text)) |
|
) |
|
return self.convert_tokens_to_ids(tokens) |
|
else: |
|
return self.convert_tokens_to_ids(text) |
|
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], (list, tuple)):
|
return text |
|
else: |
|
raise ValueError( |
|
f"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.") |
|
|
|
first_ids = get_input_ids(text) |
|
second_ids = get_input_ids(text_pair) if text_pair is not None else None |
|
|
|
if add_special_tokens: |
|
sequence = self.build_inputs_with_special_tokens(first_ids, second_ids) |
|
else: |
|
sequence = first_ids |
|
|
|
return sequence |
|
|
|
def prepare_for_model( |
|
self, |
|
ids: List[List[int]], |
|
pair_ids: Optional[List[List[int]]] = None, |
|
add_special_tokens: bool = True, |
|
padding: Union[bool, str, PaddingStrategy] = False, |
|
truncation: Union[bool, str, TruncationStrategy] = None, |
|
max_length: Optional[int] = None, |
|
stride: int = 0, |
|
pad_to_multiple_of: Optional[int] = None, |
|
return_tensors: Optional[Union[str, TensorType]] = None, |
|
return_token_type_ids: Optional[bool] = None, |
|
return_attention_mask: bool = True, |
|
return_overflowing_tokens: bool = False, |
|
return_special_tokens_mask: bool = False, |
|
return_offsets_mapping: bool = False, |
|
return_length: bool = False, |
|
verbose: bool = True, |
|
add_word_cls: bool = True, |
|
prepend_batch_axis: bool = False, |
|
**kwargs, |
|
) -> BatchEncoding: |
|
""" |
|
Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It |
|
adds special tokens, truncates sequences if overflowing while taking into account the special tokens and |
|
manages a moving window (with user defined stride) for overflowing tokens. |
|
|
|
Args: |
|
ids (`List[List[int]]`): |
|
Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and |
|
`convert_tokens_to_ids` methods. |
|
pair_ids (`List[List[int]]`, *optional*): |
|
Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize` |
|
and `convert_tokens_to_ids` methods. |
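
        Example (illustrative, given an instantiated `tokenizer`):

            ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("two words"))
            enc = tokenizer.prepare_for_model(ids, return_tensors="pt", prepend_batch_axis=True)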
|
""" |
|
|
|
|
|
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( |
|
padding=padding, |
|
truncation=truncation, |
|
max_length=max_length, |
|
pad_to_multiple_of=pad_to_multiple_of, |
|
verbose=verbose, |
|
**kwargs, |
|
) |
|
|
|
pair = bool(pair_ids is not None) |
|
len_pair_ids = len(pair_ids) if pair else 0 |
|
|
|
if return_token_type_ids and not add_special_tokens: |
|
raise ValueError( |
|
"Asking to return token_type_ids while setting add_special_tokens to False " |
|
"results in an undefined behavior. Please set add_special_tokens to True or " |
|
"set return_token_type_ids to None." |
|
) |
|
|
|
if ( |
|
return_overflowing_tokens |
|
and truncation_strategy == TruncationStrategy.LONGEST_FIRST |
|
and pair_ids is not None |
|
): |
|
raise ValueError( |
|
"Not possible to return overflowing tokens for pair of sequences with the " |
|
"`longest_first`. Please select another truncation strategy than `longest_first`, " |
|
"for instance `only_second` or `only_first`." |
|
) |
|
|
|
encoded_inputs = {} |
|
|
|
|
|
total_len = len(ids) + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0) |
|
|
|
|
|
overflowing_tokens = [] |
|
if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length: |
|
ids, pair_ids, overflowing_tokens = self.truncate_sequences( |
|
ids, |
|
pair_ids=pair_ids, |
|
num_tokens_to_remove=total_len - max_length, |
|
truncation_strategy=truncation_strategy, |
|
stride=stride, |
|
) |
|
|
|
if return_overflowing_tokens: |
|
encoded_inputs["overflowing_tokens"] = overflowing_tokens |
|
encoded_inputs["num_truncated_tokens"] = total_len - max_length |
|
|
|
if add_special_tokens: |
|
sequence = self.build_inputs_with_special_tokens(ids, pair_ids) |
|
else: |
|
sequence = ids + pair_ids if pair else ids |
|
|
|
if add_word_cls: |
|
for word in sequence: |
|
word.insert(0, self.word_cls_token_id) |
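
        # The model inputs built below are word-structured: `input_ids` holds one row of character ids per
        # word (padded to `max_word_length` further down), `char_input_mask` marks real characters within
        # each word, and `word_input_mask` marks real words in the sequence.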
|
|
|
|
|
encoded_inputs["input_ids"] = sequence |
|
encoded_inputs["char_input_mask"] = [[1]*len(word)+[0]*(self.max_word_length-len(word)) for word in sequence] |
|
encoded_inputs["word_input_mask"] = [1]*len(sequence) |
|
if return_token_type_ids or pair: |
|
encoded_inputs["word_type_ids"] = self.create_token_type_ids_from_sequences(ids, pair_ids, add_special_tokens) |
|
assert len(encoded_inputs["word_type_ids"]) == len(encoded_inputs["word_input_mask"]) |
|
|
|
|
|
for word in encoded_inputs["input_ids"]: |
|
if len(word) < self.max_word_length: |
|
word.extend([self.pad_token_id] * (self.max_word_length - len(word))) |
|
|
|
|
|
if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask: |
|
encoded_inputs = self.pad( |
|
encoded_inputs, |
|
max_length=max_length, |
|
padding=padding_strategy.value, |
|
pad_to_multiple_of=pad_to_multiple_of, |
|
return_attention_mask=return_attention_mask, |
|
) |
|
|
|
batch_outputs = BatchEncoding( |
|
encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis |
|
) |
|
|
|
return batch_outputs |
|
|
|
def _encode_plus( |
|
self, |
|
text: Union[TextInput, PreTokenizedInput, EncodedInput], |
|
text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, |
|
add_special_tokens: bool = True, |
|
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, |
|
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, |
|
max_length: Optional[int] = None, |
|
stride: int = 0, |
|
is_split_into_words: bool = False, |
|
pad_to_multiple_of: Optional[int] = None, |
|
return_tensors: Optional[Union[str, TensorType]] = None, |
|
return_token_type_ids: Optional[bool] = None, |
|
return_attention_mask: Optional[bool] = None, |
|
return_overflowing_tokens: bool = False, |
|
return_special_tokens_mask: bool = False, |
|
return_offsets_mapping: bool = False, |
|
return_length: bool = False, |
|
verbose: bool = True, |
|
add_word_cls: bool = True, |
|
**kwargs, |
|
) -> BatchEncoding: |
|
def get_input_ids(text): |
|
if isinstance(text, str): |
|
tokens = self.tokenize(text, **kwargs) |
|
return self.convert_tokens_to_ids(tokens) |
|
elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str): |
|
if is_split_into_words: |
|
tokens = list( |
|
itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text)) |
|
) |
|
return self.convert_tokens_to_ids(tokens) |
|
else: |
|
return self.convert_tokens_to_ids(text) |
|
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], (list, tuple)):
|
return text |
|
else: |
|
raise ValueError( |
|
f"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.") |
|
|
|
if return_offsets_mapping: |
|
raise NotImplementedError( |
|
"return_offset_mapping is not available when using Python tokenizers. " |
|
"To use this feature, change your tokenizer to one deriving from " |
|
"transformers.PreTrainedTokenizerFast. " |
|
"More information on available tokenizers at " |
|
"https://github.com/huggingface/transformers/pull/2674" |
|
) |
|
|
|
first_ids = get_input_ids(text) |
|
second_ids = get_input_ids(text_pair) if text_pair is not None else None |
|
|
|
return self.prepare_for_model( |
|
first_ids, |
|
pair_ids=second_ids, |
|
add_special_tokens=add_special_tokens, |
|
padding=padding_strategy.value, |
|
truncation=truncation_strategy.value, |
|
max_length=max_length, |
|
stride=stride, |
|
pad_to_multiple_of=pad_to_multiple_of, |
|
return_tensors=return_tensors, |
|
prepend_batch_axis=True, |
|
return_attention_mask=return_attention_mask, |
|
return_token_type_ids=return_token_type_ids, |
|
return_overflowing_tokens=return_overflowing_tokens, |
|
return_special_tokens_mask=return_special_tokens_mask, |
|
return_length=return_length, |
|
verbose=verbose, |
|
add_word_cls=add_word_cls, |
|
) |
|
|
|
def _batch_encode_plus( |
|
self, |
|
batch_text_or_text_pairs: Union[ |
|
List[TextInput], |
|
List[TextInputPair], |
|
List[PreTokenizedInput], |
|
List[PreTokenizedInputPair], |
|
List[EncodedInput], |
|
List[EncodedInputPair], |
|
], |
|
add_special_tokens: bool = True, |
|
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, |
|
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, |
|
max_length: Optional[int] = None, |
|
stride: int = 0, |
|
is_split_into_words: bool = False, |
|
pad_to_multiple_of: Optional[int] = None, |
|
return_tensors: Optional[Union[str, TensorType]] = None, |
|
return_token_type_ids: Optional[bool] = None, |
|
return_attention_mask: Optional[bool] = None, |
|
return_overflowing_tokens: bool = False, |
|
return_special_tokens_mask: bool = False, |
|
return_offsets_mapping: bool = False, |
|
return_length: bool = False, |
|
verbose: bool = True, |
|
**kwargs, |
|
) -> BatchEncoding: |
|
def get_input_ids(text): |
|
if isinstance(text, str): |
|
tokens = self.tokenize(text, **kwargs) |
|
return self.convert_tokens_to_ids(tokens) |
|
elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str): |
|
if is_split_into_words: |
|
tokens = list( |
|
itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text)) |
|
) |
|
return self.convert_tokens_to_ids(tokens) |
|
else: |
|
return self.convert_tokens_to_ids(text) |
|
            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], (list, tuple)):
|
return text |
|
else: |
|
raise ValueError( |
|
"Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." |
|
) |
|
|
|
if return_offsets_mapping: |
|
raise NotImplementedError( |
|
"return_offset_mapping is not available when using Python tokenizers. " |
|
"To use this feature, change your tokenizer to one deriving from " |
|
"transformers.PreTrainedTokenizerFast." |
|
) |
|
|
|
input_ids = [] |
|
for ids_or_pair_ids in batch_text_or_text_pairs: |
|
if not isinstance(ids_or_pair_ids, (list, tuple)): |
|
ids, pair_ids = ids_or_pair_ids, None |
|
elif is_split_into_words and not isinstance(ids_or_pair_ids[0], (list, tuple)): |
|
ids, pair_ids = ids_or_pair_ids, None |
|
else: |
|
ids, pair_ids = ids_or_pair_ids |
|
|
|
first_ids = get_input_ids(ids) |
|
second_ids = get_input_ids(pair_ids) if pair_ids is not None else None |
|
input_ids.append((first_ids, second_ids)) |
|
|
|
batch_outputs = self._batch_prepare_for_model( |
|
input_ids, |
|
add_special_tokens=add_special_tokens, |
|
padding_strategy=padding_strategy, |
|
truncation_strategy=truncation_strategy, |
|
max_length=max_length, |
|
stride=stride, |
|
pad_to_multiple_of=pad_to_multiple_of, |
|
return_attention_mask=return_attention_mask, |
|
return_token_type_ids=return_token_type_ids, |
|
return_overflowing_tokens=return_overflowing_tokens, |
|
return_special_tokens_mask=return_special_tokens_mask, |
|
return_length=return_length, |
|
return_tensors=return_tensors, |
|
verbose=verbose, |
|
) |
|
|
|
return BatchEncoding(batch_outputs) |
|
|
|
def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, split_long_words: bool = True) -> List[List[str]]: |
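        """
        NFKC-normalizes `text`, splits it on whitespace and splits every word into single characters. When
        `split_long_words` is True, words longer than `max_word_length - 1` characters are chunked so that each
        chunk still fits within `max_word_length` once the [WORD_CLS] character is prepended. Returns a list of
        words, each a list of single-character tokens. (`pair` and `add_special_tokens` are accepted for API
        compatibility and ignored.)
        """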
|
text = unicodedata.normalize('NFKC', text) |
|
if split_long_words: |
|
tokenized_text = [] |
|
for token in text.split(): |
|
tokens = [char for char in token] |
|
tokenized_text.extend( |
|
tokens[i: i + self.max_word_length - 1] for i in range(0, len(tokens), self.max_word_length - 1)) |
|
return tokenized_text |
|
else: |
|
return [[char for char in token] for token in text.split()] |
|
|
|
def pad( |
|
self, |
|
encoded_inputs: Union[ |
|
BatchEncoding, |
|
List[BatchEncoding], |
|
Dict[str, EncodedInput], |
|
Dict[str, List[EncodedInput]], |
|
List[Dict[str, EncodedInput]], |
|
], |
|
padding: Union[bool, str, PaddingStrategy] = True, |
|
max_length: Optional[int] = None, |
|
pad_to_multiple_of: Optional[int] = None, |
|
return_attention_mask: Optional[bool] = None, |
|
return_tensors: Optional[Union[str, TensorType]] = None, |
|
|
|
verbose: bool = True, |
|
) -> BatchEncoding: |
|
|
|
|
|
if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], Mapping): |
|
encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()} |
|
|
required_input = encoded_inputs["input_ids"] |
|
|
padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies( |
|
padding=padding, max_length=max_length, verbose=verbose) |
|
|
|
if padding_strategy == PaddingStrategy.DO_NOT_PAD: |
|
return encoded_inputs |
|
|
|
        assert padding_strategy == PaddingStrategy.LONGEST, f"HLMTokenizer.pad only supports padding=True/'longest' or no padding, got {padding_strategy}"
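
        # Batch padding is word-level: each example is a list of fixed-length words, so shorter examples are
        # right-padded with whole pad "words" (and all-zero mask rows) up to the longest example in the batch.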
|
|
|
longest_in_batch = max(len(f) for f in required_input) |
|
batch_outputs = {} |
|
batch_outputs["input_ids"] = [f + self.pad_word*(longest_in_batch - len(f)) for f in encoded_inputs["input_ids"]] |
|
batch_outputs["char_input_mask"] = [f + self.pad_mask_word*(longest_in_batch - len(f)) for f in encoded_inputs["char_input_mask"]] |
|
|
|
batch_outputs["word_input_mask"] = \ |
|
[f + [0]*(longest_in_batch - len(f)) for f in encoded_inputs['word_input_mask']] |
|
|
|
if "word_type_ids" in encoded_inputs: |
|
batch_outputs["word_type_ids"] = [f + [0]*(longest_in_batch - len(f)) for f in encoded_inputs["word_type_ids"]] |
|
|
|
batch_outputs["char_input_mask"] = torch.tensor(batch_outputs["char_input_mask"], dtype=torch.bool) |
|
batch_outputs["word_input_mask"] = torch.tensor(batch_outputs["word_input_mask"], dtype=torch.bool) |
|
|
|
|
|
label_fields = ('labels', 'upos', 'feats', 'heads', 'deprels', 'lemmas') |
|
label_names = [feature for feature in encoded_inputs.keys() if feature in label_fields] |
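
        # Word-structured label fields are padded with `label_pad_token_id` (-100) so that losses which ignore
        # index -100 (e.g. torch.nn.CrossEntropyLoss's default `ignore_index`) skip the padded positions.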
|
|
|
if len(label_names) > 0: |
|
def to_list(tensor_or_iterable): |
|
if is_torch_tensor(tensor_or_iterable): |
|
return tensor_or_iterable.tolist() |
|
return list(tensor_or_iterable) |
|
|
|
for label_name in label_names: |
|
if label_name not in encoded_inputs: |
|
continue |
|
labels = encoded_inputs[label_name] |
|
label_pad_word = [[self.label_pad_token_id]*self.max_word_length] |
|
if self.padding_side == "right": |
|
batch_outputs[label_name] = [ |
|
to_list(label) + label_pad_word * (longest_in_batch - len(label)) for label in labels |
|
] |
|
else: |
|
batch_outputs[label_name] = [ |
|
label_pad_word * (longest_in_batch - len(label)) + to_list(label) for label in labels |
|
] |
|
|
|
return BatchEncoding(batch_outputs, tensor_type=return_tensors) |
|
|