// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Configuration for the text encoder op.
// Every type declared after this statement lives in `libtextclassifier3`.
namespace libtextclassifier3;
// Matcher type referenced by `TextEncoderConfig.matcher_type`.
// Values are stored as a single byte. The numeric values are part of the
// serialized format: do not renumber existing entries, only append new ones.
// NOTE(review): presumably each value names the data structure used to look
// up sentence pieces -- confirm against the encoder implementation.
enum SentencePieceMatcherType : byte {
  MAPPED_TRIE = 0,
  SORTED_STRING_TABLE = 1,
}
// Configuration of the text encoder.
//
// No field carries an explicit `id` attribute, so the declaration order below
// defines the binary layout: add new fields at the end only, and never
// reorder or remove existing ones.
table TextEncoderConfig {
  // Code that is used as encoding of the start code.
  start_code:int32 = 0;

  // Code that is used as encoding of the end code.
  end_code:int32 = 1;

  // This value is added to all codes to make them not intersect with
  // `start_code` and `end_code`.
  encoding_offset:int32 = 2;

  // Code that is used for out-of-dictionary characters.
  unknown_code:int32 = -1;

  // Penalty associated with the unknown code.
  // No explicit default is given, so an absent value reads as 0.0.
  unknown_score:float;

  // Normalization options.

  // Serialized normalization charsmap.
  normalization_charsmap:string;

  // Companion data for `normalization_charsmap`.
  // NOTE(review): presumably the serialized replacement values the charsmap
  // refers to -- confirm against the normalizer implementation.
  normalization_charsmap_values:string;

  // Whether to add dummy whitespace at the beginning of the text in order to
  // treat "world" in "world" and "hello world" uniformly.
  add_dummy_prefix:bool = true;

  // Whether to remove leading, trailing and duplicate internal whitespace.
  remove_extra_whitespaces:bool = true;

  // Whether to replace whitespace with a meta symbol.
  escape_whitespaces:bool = true;

  // Sentence pieces scores.
  pieces_scores:[float];

  // Serialized sentence pieces.
  pieces:string;

  // NOTE(review): presumably the offset of each sentence piece within
  // `pieces` -- confirm against the encoder implementation.
  pieces_offsets:[uint32];

  // Matcher type for the sentence pieces; defaults to MAPPED_TRIE.
  matcher_type:SentencePieceMatcherType = MAPPED_TRIE;
}