Spaces:
Running
on
Zero
Running
on
Zero
File size: 1,223 Bytes
568e264 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
from abc import ABC, abstractmethod, abstractproperty
from typing import Dict, List, Tuple, Union
# A single token may be either text or raw bytes.
T = Union[str, bytes]


class BaseTokenizer(ABC):
    """Abstract interface for converting between text, tokens and integer ids.

    Concrete subclasses implement the four primitive conversions
    (``text2tokens``, ``tokens2text``, ``tokens2ids``, ``ids2tokens``) plus
    ``vocab_size`` and ``symbol_table``.  The two-step helpers ``tokenize``
    and ``detokenize`` are provided here by composing those primitives.
    """

    def tokenize(self, line: str) -> Tuple[List[T], List[int]]:
        """Split ``line`` into tokens and map those tokens to ids.

        Args:
            line: The text to tokenize.

        Returns:
            A ``(tokens, ids)`` pair, where ``tokens`` is the result of
            ``text2tokens(line)`` and ``ids`` is ``tokens2ids(tokens)``.
        """
        tokens = self.text2tokens(line)
        ids = self.tokens2ids(tokens)
        return tokens, ids

    def detokenize(self, ids: List[int]) -> Tuple[str, List[T]]:
        """Map ``ids`` back to tokens and join those tokens into text.

        Args:
            ids: The integer ids to convert.

        Returns:
            A ``(text, tokens)`` pair, where ``tokens`` is the result of
            ``ids2tokens(ids)`` and ``text`` is ``tokens2text(tokens)``.
        """
        tokens = self.ids2tokens(ids)
        text = self.tokens2text(tokens)
        return text, tokens

    @abstractmethod
    def text2tokens(self, line: str) -> List[T]:
        """Convert a line of text into a list of tokens."""
        raise NotImplementedError("abstract method")

    @abstractmethod
    def tokens2text(self, tokens: List[T]) -> str:
        """Convert a list of tokens into text."""
        raise NotImplementedError("abstract method")

    @abstractmethod
    def tokens2ids(self, tokens: List[T]) -> List[int]:
        """Convert a list of tokens into a list of integer ids."""
        raise NotImplementedError("abstract method")

    @abstractmethod
    def ids2tokens(self, ids: List[int]) -> List[T]:
        """Convert a list of integer ids into a list of tokens."""
        raise NotImplementedError("abstract method")

    @abstractmethod
    def vocab_size(self) -> int:
        """Return an integer size (presumably the number of vocabulary entries)."""
        raise NotImplementedError("abstract method")

    # ``abc.abstractproperty`` is deprecated since Python 3.3; stacking
    # ``@property`` on ``@abstractmethod`` is the supported equivalent and
    # still prevents instantiation until a subclass overrides it.
    @property
    @abstractmethod
    def symbol_table(self) -> Dict[T, int]:
        """Mapping from token to integer (presumably the token's id)."""
        raise NotImplementedError("abstract method")
|