pranavSIT
/

PaliOpenVocabSegmentation

added pali inference

74e8f2f 12 months ago

1.58 kB

	/**
	* @license
	* Copyright Big Vision Authors
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/**
	* @fileoverview Utility code shared between tokenizers.
	*/

	/**
	* A vocabulary is a list of `[token, value]` pairs: the token text together
	* with a numerical value.
	* The numerical value is used by the unigram algorithm to find the best
	* tokenization, and is ignored by the BPE algorithm.
	*/
	export type Vocabulary = [string, number][];

	/**
	* Converts a string to a sequence of tokens.
	*
	* The contract is a single `encode` method; how the input is split is left
	* to the implementing class.
	*/
	export interface Tokenizer {
	/**
	* Tokenizes `input` and returns the result as an array of numbers.
	* NOTE(review): presumably each number is a token id referring to an entry
	* of the vocabulary the tokenizer was built from - confirm against the
	* implementations.
	*
	* @param input The text to tokenize.
	* @returns The tokens, one number per token.
	*/
	encode(input: string): number[];
	}

	/**
	* Factory for new `Tokenizer`.
	*
	* Describes a constructor type (the static side of a tokenizer class):
	* any value of this type can be invoked with `new` and a `Vocabulary`
	* to obtain a `Tokenizer`.
	*/
	export interface TokenizerConstructor {
	/**
	* Construct signature.
	*
	* @param vocabulary The `[token, value]` list the new tokenizer is built from.
	* @returns An object implementing `Tokenizer`.
	*/
	new (vocabulary: Vocabulary): Tokenizer;
	}

	/**
	* Splits a string into its characters, iterating by Unicode code point
	* rather than by UTF-16 code unit.
	*
	* A character outside the Basic Multilingual Plane (stored as a surrogate
	* pair) therefore comes back as one element instead of two. Combining marks
	* are separate code points and remain separate elements.
	*
	* @param input The string to split.
	* @returns One array element per code point, in order; empty for `''`.
	*/
	export const stringToChars = (input: string): string[] =>
	  // Array.from consumes the string iterator, which yields whole code points.
	  Array.from(input);

	/**
	* Special separator character used to delimit sub-word tokens.
	*
	* The value is U+2581, the Unicode character 'lower one eighth block'.
	*/
	export const TOKEN_SEPARATOR = '\u2581';