DuyTa's picture
Upload folder using huggingface_hub
c3b1078 verified
raw
history blame
5.1 kB
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import string
from collections import OrderedDict
from typing import Dict, List, Union
# Key whose value is the bare literal `true` instead of a quoted string.
PRESERVE_ORDER_KEY = "preserve_order"
# Sentinel stored in `TokenParser.char` once the input has been exhausted.
EOS = "<EOS>"


class TokenParser:
    """
    Parses tokenized/classified text, e.g.
    'tokens { money { integer: "20" currency: "$" } } tokens { name: "left" }'

    Usage: call the instance with the text to load it, then call `parse()`.

    A string value is terminated by a double quote that is followed by a space
    or by the end of the input. This lets a value contain double quotes
    itself, but it also means a closing quote must not be directly followed by
    another character such as '}'.

    Args
        text: tokenized text
    """

    # Characters allowed in a key.
    _KEY_CHARS = frozenset(string.ascii_letters + "_")

    def __init__(self) -> None:
        """Starts in an exhausted state so `parse()` yields [] until text is loaded."""
        self.text = ""
        self.len_text = 0
        self.index = 0
        self.char = EOS

    def __call__(self, text):
        """
        Setup function

        Args:
            text: text to be parsed
        """
        self.text = text
        self.len_text = len(text)
        self.index = 0
        # An empty input is immediately at end-of-stream instead of raising IndexError.
        self.char = text[0] if text else EOS

    def parse(self) -> List[dict]:
        """
        Main function. Implements grammar:
        A -> space F space F space F ... space

        Stops at the end of the input or at the first position where no key
        can be read (e.g. a closing '}').

        Returns list of dictionaries
        """
        tokens = []
        while self.parse_ws():
            token = self.parse_token()
            if token is None:
                break
            tokens.append(token)
        return tokens

    def parse_token(self) -> Union[Dict[str, Union[str, dict, bool, None]], None]:
        """
        Implements grammar:
        F-> no_space KG no_space

        Returns: single-entry ordered dictionary {K: G}, or None if no key
            starts at the current position
        """
        key = self.parse_string_key()
        if key is None:
            return None
        self.parse_ws()
        if key == PRESERVE_ORDER_KEY:
            # `preserve_order: true` carries an unquoted literal; the stored value is True.
            self.parse_char(":")
            self.parse_ws()
            value = self.parse_chars("true")
        else:
            value = self.parse_token_value()
        return OrderedDict([(key, value)])

    def parse_token_value(self) -> Union[str, dict, None]:
        """
        Implements grammar:
        G-> no_space :"VALUE" no_space | no_space {A} no_space

        Returns: string (None for an empty string value) or dictionary

        Raises:
            ValueError: if the current character is neither ':' nor '{'
        """
        if self.char == ":":
            self.parse_char(":")
            self.parse_ws()
            self.parse_char("\"")
            value_string = self.parse_string_value()
            self.parse_char("\"")
            return value_string
        if self.char == "{":
            self.parse_char("{")
            # flatten the nested single-entry token dicts into one ordered dict
            merged = OrderedDict()
            for token in self.parse():
                merged.update(token)
            self.parse_char("}")
            return merged
        raise ValueError(
            "expected ':' or '{{' at index {}, found {!r}".format(self.index, self.char)
        )

    def parse_char(self, exp) -> bool:
        """
        Parses character

        Args:
            exp: character to read in

        Returns true if successful

        Raises:
            AssertionError: if the current character is not `exp`
        """
        # Raised explicitly (not via `assert`) so the check survives `python -O`.
        if self.char != exp:
            raise AssertionError(
                "expected {!r} at index {}, found {!r}".format(exp, self.index, self.char)
            )
        self.read()
        return True

    def parse_chars(self, exp) -> bool:
        """
        Parses characters

        Args:
            exp: characters to read in

        Returns true if at least one character was read in (false for empty `exp`)

        Raises:
            AssertionError: if the input does not match `exp`
        """
        for expected in exp:
            self.parse_char(expected)
        return bool(exp)

    def parse_string_key(self) -> Union[str, None]:
        """
        Parses string key, can only contain ascii letters and '_' characters

        Returns parsed string key, or None if the current character cannot start a key

        Raises:
            AssertionError: if positioned on whitespace or at end of input
            ValueError: if the input ends while the key is still being read
        """
        if self.char == EOS or self.char in string.whitespace:
            raise AssertionError(
                "key cannot start at whitespace or end of input (index {})".format(self.index)
            )
        chars = []
        while self.char in self._KEY_CHARS:
            chars.append(self.char)
            if not self.read():
                raise ValueError("input ended while reading key {!r}".format("".join(chars)))
        if not chars:
            return None
        return "".join(chars)

    def parse_string_value(self) -> Union[str, None]:
        """
        Parses string value, ends with quote followed by space or by end of input

        Returns parsed string value, or None if the value is empty

        Raises:
            AssertionError: if positioned on whitespace or at end of input
            ValueError: if the input ends before a closing quote is found
        """
        if self.char == EOS or self.char in string.whitespace:
            raise AssertionError(
                "value cannot start at whitespace or end of input (index {})".format(self.index)
            )
        chars = []
        while not self._at_closing_quote():
            chars.append(self.char)
            if not self.read():
                raise ValueError("input ended inside a string value (missing closing quote)")
        if not chars:
            return None
        return "".join(chars)

    def _at_closing_quote(self) -> bool:
        """Returns true if the current char is a quote followed by a space or by end of input."""
        if self.char != "\"":
            return False
        following = self.index + 1
        # Bounds check first: a quote that is the last character also closes the value.
        return following >= self.len_text or self.text[following] == " "

    def parse_ws(self) -> bool:
        """
        Deletes whitespaces.

        Returns true if not EOS after parsing
        """
        not_eos = self.char != EOS
        while not_eos and self.char == " ":
            not_eos = self.read()
        return not_eos

    def read(self) -> bool:
        """
        Reads in next char.

        Returns true if not EOS
        """
        if self.index < self.len_text - 1:
            self.index += 1
            self.char = self.text[self.index]
            return True
        # Past the last character: flag end-of-stream; index stays on the last position.
        self.char = EOS
        return False