|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import string |
|
from collections import OrderedDict |
|
from typing import Dict, List, Union |
|
|
|
PRESERVE_ORDER_KEY = "preserve_order" |
|
EOS = "<EOS>" |
|
|
|
|
|
class TokenParser: |
|
""" |
|
Parses tokenized/classified text, e.g. 'tokens { money { integer: "20" currency: "$" } } tokens { name: "left"}' |
|
|
|
Args |
|
text: tokenized text |
|
""" |
|
|
|
def __call__(self, text): |
|
""" |
|
Setup function |
|
|
|
Args: |
|
text: text to be parsed |
|
|
|
""" |
|
self.text = text |
|
self.len_text = len(text) |
|
self.char = text[0] |
|
self.index = 0 |
|
|
|
def parse(self) -> List[dict]: |
|
""" |
|
Main function. Implements grammar: |
|
A -> space F space F space F ... space |
|
|
|
Returns list of dictionaries |
|
""" |
|
l = list() |
|
while self.parse_ws(): |
|
token = self.parse_token() |
|
if not token: |
|
break |
|
l.append(token) |
|
return l |
|
|
|
def parse_token(self) -> Dict[str, Union[str, dict]]: |
|
""" |
|
Implements grammar: |
|
F-> no_space KG no_space |
|
|
|
Returns: K, G as dictionary values |
|
""" |
|
d = OrderedDict() |
|
key = self.parse_string_key() |
|
if key is None: |
|
return None |
|
self.parse_ws() |
|
if key == PRESERVE_ORDER_KEY: |
|
self.parse_char(":") |
|
self.parse_ws() |
|
value = self.parse_chars("true") |
|
else: |
|
value = self.parse_token_value() |
|
|
|
d[key] = value |
|
return d |
|
|
|
def parse_token_value(self) -> Union[str, dict]: |
|
""" |
|
Implements grammar: |
|
G-> no_space :"VALUE" no_space | no_space {A} no_space |
|
|
|
Returns: string or dictionary |
|
""" |
|
if self.char == ":": |
|
self.parse_char(":") |
|
self.parse_ws() |
|
self.parse_char("\"") |
|
value_string = self.parse_string_value() |
|
self.parse_char("\"") |
|
return value_string |
|
elif self.char == "{": |
|
d = OrderedDict() |
|
self.parse_char("{") |
|
list_token_dicts = self.parse() |
|
|
|
for tok_dict in list_token_dicts: |
|
for k, v in tok_dict.items(): |
|
d[k] = v |
|
self.parse_char("}") |
|
return d |
|
else: |
|
raise ValueError() |
|
|
|
def parse_char(self, exp) -> bool: |
|
""" |
|
Parses character |
|
|
|
Args: |
|
exp: character to read in |
|
|
|
Returns true if successful |
|
""" |
|
assert self.char == exp |
|
self.read() |
|
return True |
|
|
|
def parse_chars(self, exp) -> bool: |
|
""" |
|
Parses characters |
|
|
|
Args: |
|
exp: characters to read in |
|
|
|
Returns true if successful |
|
""" |
|
ok = False |
|
for x in exp: |
|
ok |= self.parse_char(x) |
|
return ok |
|
|
|
def parse_string_key(self) -> str: |
|
""" |
|
Parses string key, can only contain ascii and '_' characters |
|
|
|
Returns parsed string key |
|
""" |
|
assert self.char not in string.whitespace and self.char != EOS |
|
incl_criterium = string.ascii_letters + "_" |
|
l = [] |
|
while self.char in incl_criterium: |
|
l.append(self.char) |
|
if not self.read(): |
|
raise ValueError() |
|
|
|
if not l: |
|
return None |
|
return "".join(l) |
|
|
|
def parse_string_value(self) -> str: |
|
""" |
|
Parses string value, ends with quote followed by space |
|
|
|
Returns parsed string value |
|
""" |
|
assert self.char not in string.whitespace and self.char != EOS |
|
l = [] |
|
while self.char != "\"" or self.text[self.index + 1] != " ": |
|
l.append(self.char) |
|
if not self.read(): |
|
raise ValueError() |
|
|
|
if not l: |
|
return None |
|
return "".join(l) |
|
|
|
def parse_ws(self): |
|
""" |
|
Deletes whitespaces. |
|
|
|
Returns true if not EOS after parsing |
|
""" |
|
not_eos = self.char != EOS |
|
while not_eos and self.char == " ": |
|
not_eos = self.read() |
|
return not_eos |
|
|
|
def read(self): |
|
""" |
|
Reads in next char. |
|
|
|
Returns true if not EOS |
|
""" |
|
if self.index < self.len_text - 1: |
|
self.index += 1 |
|
self.char = self.text[self.index] |
|
return True |
|
self.char = EOS |
|
return False |
|
|