|
|
|
class Normalizer:
    """
    Base class for all normalizers

    This class is not supposed to be instantiated directly. Instead, any implementation of a
    Normalizer will return an instance of this class when instantiated.
    """

    def normalize(self, normalized):
        """
        Normalize a :class:`~tokenizers.NormalizedString` in-place

        This method allows modifying a :class:`~tokenizers.NormalizedString` while
        keeping track of the alignment information. If you just want to see the result
        of the normalization on a raw string, you can use
        :meth:`~tokenizers.normalizers.Normalizer.normalize_str`

        Args:
            normalized (:class:`~tokenizers.NormalizedString`):
                The normalized string on which to apply this
                :class:`~tokenizers.normalizers.Normalizer`
        """
        pass

    def normalize_str(self, sequence):
        """
        Normalize the given string

        This method provides a way to visualize the effect of a
        :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
        information. If you need to get/convert offsets, you can use
        :meth:`~tokenizers.normalizers.Normalizer.normalize`

        Args:
            sequence (:obj:`str`):
                A string to normalize

        Returns:
            :obj:`str`: A string after normalization
        """
        pass
|
|
|
class BertNormalizer(Normalizer):
    """
    BertNormalizer

    Takes care of normalizing raw text before giving it to a Bert model.
    This includes cleaning the text, handling accents, Chinese chars and lowercasing

    Args:
        clean_text (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether to clean the text, by removing any control characters
            and replacing all whitespaces by the classic one.

        handle_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether to handle Chinese chars by putting spaces around them.

        strip_accents (:obj:`bool`, `optional`):
            Whether to strip all accents. If this option is not specified (i.e. is None),
            then it will be determined by the value for `lowercase` (as in the original Bert).

        lowercase (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether to lowercase.
    """

    def __init__(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True):
        pass

    def normalize(self, normalized):
        """
        Normalize a :class:`~tokenizers.NormalizedString` in-place

        This method allows modifying a :class:`~tokenizers.NormalizedString` while
        keeping track of the alignment information. If you just want to see the result
        of the normalization on a raw string, you can use
        :meth:`~tokenizers.normalizers.Normalizer.normalize_str`

        Args:
            normalized (:class:`~tokenizers.NormalizedString`):
                The normalized string on which to apply this
                :class:`~tokenizers.normalizers.Normalizer`
        """
        pass

    def normalize_str(self, sequence):
        """
        Normalize the given string

        This method provides a way to visualize the effect of a
        :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
        information. If you need to get/convert offsets, you can use
        :meth:`~tokenizers.normalizers.Normalizer.normalize`

        Args:
            sequence (:obj:`str`):
                A string to normalize

        Returns:
            :obj:`str`: A string after normalization
        """
        pass
|
|
|
class Lowercase(Normalizer):
    """
    Lowercase Normalizer
    """

    def __init__(self):
        pass

    def normalize(self, normalized):
        """
        Normalize a :class:`~tokenizers.NormalizedString` in-place

        This method allows modifying a :class:`~tokenizers.NormalizedString` while
        keeping track of the alignment information. If you just want to see the result
        of the normalization on a raw string, you can use
        :meth:`~tokenizers.normalizers.Normalizer.normalize_str`

        Args:
            normalized (:class:`~tokenizers.NormalizedString`):
                The normalized string on which to apply this
                :class:`~tokenizers.normalizers.Normalizer`
        """
        pass

    def normalize_str(self, sequence):
        """
        Normalize the given string

        This method provides a way to visualize the effect of a
        :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
        information. If you need to get/convert offsets, you can use
        :meth:`~tokenizers.normalizers.Normalizer.normalize`

        Args:
            sequence (:obj:`str`):
                A string to normalize

        Returns:
            :obj:`str`: A string after normalization
        """
        pass
|
|
|
class NFC(Normalizer):
    """
    NFC Unicode Normalizer
    """

    def __init__(self):
        pass

    def normalize(self, normalized):
        """
        Normalize a :class:`~tokenizers.NormalizedString` in-place

        This method allows modifying a :class:`~tokenizers.NormalizedString` while
        keeping track of the alignment information. If you just want to see the result
        of the normalization on a raw string, you can use
        :meth:`~tokenizers.normalizers.Normalizer.normalize_str`

        Args:
            normalized (:class:`~tokenizers.NormalizedString`):
                The normalized string on which to apply this
                :class:`~tokenizers.normalizers.Normalizer`
        """
        pass

    def normalize_str(self, sequence):
        """
        Normalize the given string

        This method provides a way to visualize the effect of a
        :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
        information. If you need to get/convert offsets, you can use
        :meth:`~tokenizers.normalizers.Normalizer.normalize`

        Args:
            sequence (:obj:`str`):
                A string to normalize

        Returns:
            :obj:`str`: A string after normalization
        """
        pass
|
|
|
class NFD(Normalizer):
    """
    NFD Unicode Normalizer
    """

    def __init__(self):
        pass

    def normalize(self, normalized):
        """
        Normalize a :class:`~tokenizers.NormalizedString` in-place

        This method allows modifying a :class:`~tokenizers.NormalizedString` while
        keeping track of the alignment information. If you just want to see the result
        of the normalization on a raw string, you can use
        :meth:`~tokenizers.normalizers.Normalizer.normalize_str`

        Args:
            normalized (:class:`~tokenizers.NormalizedString`):
                The normalized string on which to apply this
                :class:`~tokenizers.normalizers.Normalizer`
        """
        pass

    def normalize_str(self, sequence):
        """
        Normalize the given string

        This method provides a way to visualize the effect of a
        :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
        information. If you need to get/convert offsets, you can use
        :meth:`~tokenizers.normalizers.Normalizer.normalize`

        Args:
            sequence (:obj:`str`):
                A string to normalize

        Returns:
            :obj:`str`: A string after normalization
        """
        pass
|
|
|
class NFKC(Normalizer):
    """
    NFKC Unicode Normalizer
    """

    def __init__(self):
        pass

    def normalize(self, normalized):
        """
        Normalize a :class:`~tokenizers.NormalizedString` in-place

        This method allows modifying a :class:`~tokenizers.NormalizedString` while
        keeping track of the alignment information. If you just want to see the result
        of the normalization on a raw string, you can use
        :meth:`~tokenizers.normalizers.Normalizer.normalize_str`

        Args:
            normalized (:class:`~tokenizers.NormalizedString`):
                The normalized string on which to apply this
                :class:`~tokenizers.normalizers.Normalizer`
        """
        pass

    def normalize_str(self, sequence):
        """
        Normalize the given string

        This method provides a way to visualize the effect of a
        :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
        information. If you need to get/convert offsets, you can use
        :meth:`~tokenizers.normalizers.Normalizer.normalize`

        Args:
            sequence (:obj:`str`):
                A string to normalize

        Returns:
            :obj:`str`: A string after normalization
        """
        pass
|
|
|
class NFKD(Normalizer):
    """
    NFKD Unicode Normalizer
    """

    def __init__(self):
        pass

    def normalize(self, normalized):
        """
        Normalize a :class:`~tokenizers.NormalizedString` in-place

        This method allows modifying a :class:`~tokenizers.NormalizedString` while
        keeping track of the alignment information. If you just want to see the result
        of the normalization on a raw string, you can use
        :meth:`~tokenizers.normalizers.Normalizer.normalize_str`

        Args:
            normalized (:class:`~tokenizers.NormalizedString`):
                The normalized string on which to apply this
                :class:`~tokenizers.normalizers.Normalizer`
        """
        pass

    def normalize_str(self, sequence):
        """
        Normalize the given string

        This method provides a way to visualize the effect of a
        :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
        information. If you need to get/convert offsets, you can use
        :meth:`~tokenizers.normalizers.Normalizer.normalize`

        Args:
            sequence (:obj:`str`):
                A string to normalize

        Returns:
            :obj:`str`: A string after normalization
        """
        pass
|
|
|
class Nmt(Normalizer):
    """
    Nmt normalizer
    """

    def __init__(self):
        pass

    def normalize(self, normalized):
        """
        Normalize a :class:`~tokenizers.NormalizedString` in-place

        This method allows modifying a :class:`~tokenizers.NormalizedString` while
        keeping track of the alignment information. If you just want to see the result
        of the normalization on a raw string, you can use
        :meth:`~tokenizers.normalizers.Normalizer.normalize_str`

        Args:
            normalized (:class:`~tokenizers.NormalizedString`):
                The normalized string on which to apply this
                :class:`~tokenizers.normalizers.Normalizer`
        """
        pass

    def normalize_str(self, sequence):
        """
        Normalize the given string

        This method provides a way to visualize the effect of a
        :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
        information. If you need to get/convert offsets, you can use
        :meth:`~tokenizers.normalizers.Normalizer.normalize`

        Args:
            sequence (:obj:`str`):
                A string to normalize

        Returns:
            :obj:`str`: A string after normalization
        """
        pass
|
|
|
class Precompiled(Normalizer):
    """
    Precompiled normalizer

    Don't use it manually: it is used for compatibility with SentencePiece.

    Args:
        precompiled_charsmap:
            Presumably the precompiled character map this normalizer applies
            (TODO: confirm the expected type against the implementation)
    """

    def __init__(self, precompiled_charsmap):
        pass

    def normalize(self, normalized):
        """
        Normalize a :class:`~tokenizers.NormalizedString` in-place

        This method allows modifying a :class:`~tokenizers.NormalizedString` while
        keeping track of the alignment information. If you just want to see the result
        of the normalization on a raw string, you can use
        :meth:`~tokenizers.normalizers.Normalizer.normalize_str`

        Args:
            normalized (:class:`~tokenizers.NormalizedString`):
                The normalized string on which to apply this
                :class:`~tokenizers.normalizers.Normalizer`
        """
        pass

    def normalize_str(self, sequence):
        """
        Normalize the given string

        This method provides a way to visualize the effect of a
        :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
        information. If you need to get/convert offsets, you can use
        :meth:`~tokenizers.normalizers.Normalizer.normalize`

        Args:
            sequence (:obj:`str`):
                A string to normalize

        Returns:
            :obj:`str`: A string after normalization
        """
        pass
|
|
|
class Prepend(Normalizer):
    """
    Prepend normalizer

    Args:
        prepend:
            Presumably the content that gets prepended to the input
            (TODO: confirm the expected type against the implementation)
    """

    def __init__(self, prepend):
        pass

    def normalize(self, normalized):
        """
        Normalize a :class:`~tokenizers.NormalizedString` in-place

        This method allows modifying a :class:`~tokenizers.NormalizedString` while
        keeping track of the alignment information. If you just want to see the result
        of the normalization on a raw string, you can use
        :meth:`~tokenizers.normalizers.Normalizer.normalize_str`

        Args:
            normalized (:class:`~tokenizers.NormalizedString`):
                The normalized string on which to apply this
                :class:`~tokenizers.normalizers.Normalizer`
        """
        pass

    def normalize_str(self, sequence):
        """
        Normalize the given string

        This method provides a way to visualize the effect of a
        :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
        information. If you need to get/convert offsets, you can use
        :meth:`~tokenizers.normalizers.Normalizer.normalize`

        Args:
            sequence (:obj:`str`):
                A string to normalize

        Returns:
            :obj:`str`: A string after normalization
        """
        pass
|
|
|
class Replace(Normalizer):
    """
    Replace normalizer

    Args:
        pattern:
            Presumably what is searched for in the input
            (TODO: confirm the accepted pattern types against the implementation)

        content:
            Presumably what each match of `pattern` is replaced with
            (TODO: confirm against the implementation)
    """

    def __init__(self, pattern, content):
        pass

    def normalize(self, normalized):
        """
        Normalize a :class:`~tokenizers.NormalizedString` in-place

        This method allows modifying a :class:`~tokenizers.NormalizedString` while
        keeping track of the alignment information. If you just want to see the result
        of the normalization on a raw string, you can use
        :meth:`~tokenizers.normalizers.Normalizer.normalize_str`

        Args:
            normalized (:class:`~tokenizers.NormalizedString`):
                The normalized string on which to apply this
                :class:`~tokenizers.normalizers.Normalizer`
        """
        pass

    def normalize_str(self, sequence):
        """
        Normalize the given string

        This method provides a way to visualize the effect of a
        :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
        information. If you need to get/convert offsets, you can use
        :meth:`~tokenizers.normalizers.Normalizer.normalize`

        Args:
            sequence (:obj:`str`):
                A string to normalize

        Returns:
            :obj:`str`: A string after normalization
        """
        pass
|
|
|
class Sequence(Normalizer):
    """
    Allows concatenating multiple other Normalizer as a Sequence.
    All the normalizers run in sequence in the given order

    Args:
        normalizers (:obj:`List[Normalizer]`):
            A list of Normalizer to be run as a sequence
    """

    # The class docstring documents a `normalizers` argument; declare the
    # constructor so the signature matches it, as the sibling classes do.
    def __init__(self, normalizers):
        pass

    def normalize(self, normalized):
        """
        Normalize a :class:`~tokenizers.NormalizedString` in-place

        This method allows modifying a :class:`~tokenizers.NormalizedString` while
        keeping track of the alignment information. If you just want to see the result
        of the normalization on a raw string, you can use
        :meth:`~tokenizers.normalizers.Normalizer.normalize_str`

        Args:
            normalized (:class:`~tokenizers.NormalizedString`):
                The normalized string on which to apply this
                :class:`~tokenizers.normalizers.Normalizer`
        """
        pass

    def normalize_str(self, sequence):
        """
        Normalize the given string

        This method provides a way to visualize the effect of a
        :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
        information. If you need to get/convert offsets, you can use
        :meth:`~tokenizers.normalizers.Normalizer.normalize`

        Args:
            sequence (:obj:`str`):
                A string to normalize

        Returns:
            :obj:`str`: A string after normalization
        """
        pass
|
|
|
class Strip(Normalizer):
    """
    Strip normalizer

    Args:
        left (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Presumably whether to strip on the left side of the input
            (TODO: confirm against the implementation)

        right (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Presumably whether to strip on the right side of the input
            (TODO: confirm against the implementation)
    """

    def __init__(self, left=True, right=True):
        pass

    def normalize(self, normalized):
        """
        Normalize a :class:`~tokenizers.NormalizedString` in-place

        This method allows modifying a :class:`~tokenizers.NormalizedString` while
        keeping track of the alignment information. If you just want to see the result
        of the normalization on a raw string, you can use
        :meth:`~tokenizers.normalizers.Normalizer.normalize_str`

        Args:
            normalized (:class:`~tokenizers.NormalizedString`):
                The normalized string on which to apply this
                :class:`~tokenizers.normalizers.Normalizer`
        """
        pass

    def normalize_str(self, sequence):
        """
        Normalize the given string

        This method provides a way to visualize the effect of a
        :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
        information. If you need to get/convert offsets, you can use
        :meth:`~tokenizers.normalizers.Normalizer.normalize`

        Args:
            sequence (:obj:`str`):
                A string to normalize

        Returns:
            :obj:`str`: A string after normalization
        """
        pass
|
|
|
class StripAccents(Normalizer):
    """
    StripAccents normalizer
    """

    def __init__(self):
        pass

    def normalize(self, normalized):
        """
        Normalize a :class:`~tokenizers.NormalizedString` in-place

        This method allows modifying a :class:`~tokenizers.NormalizedString` while
        keeping track of the alignment information. If you just want to see the result
        of the normalization on a raw string, you can use
        :meth:`~tokenizers.normalizers.Normalizer.normalize_str`

        Args:
            normalized (:class:`~tokenizers.NormalizedString`):
                The normalized string on which to apply this
                :class:`~tokenizers.normalizers.Normalizer`
        """
        pass

    def normalize_str(self, sequence):
        """
        Normalize the given string

        This method provides a way to visualize the effect of a
        :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
        information. If you need to get/convert offsets, you can use
        :meth:`~tokenizers.normalizers.Normalizer.normalize`

        Args:
            sequence (:obj:`str`):
                A string to normalize

        Returns:
            :obj:`str`: A string after normalization
        """
        pass
|
|