|
import sys |
|
|
|
|
|
def clean_vocab(in_vocab_fname: str, out_vocab_fname: str) -> None:
    """
    Cleans a vocabulary file by filtering out invalid lines.

    A line is considered valid when, after stripping surrounding carriage
    returns, newlines and spaces, splitting it on a single space yields
    exactly two fields. Valid lines are copied to the output file unchanged.
    Every other line is dropped and reported on stdout: first its zero-based
    line index together with its stripped text, then each character of the
    raw line next to that character's hexadecimal code point.

    Args:
        in_vocab_fname (str): path of the input vocabulary file.
        out_vocab_fname (str): path of the output (cleaned) vocabulary file.
    """
    with open(in_vocab_fname, "r", encoding="utf-8") as infile, open(
        out_vocab_fname, "w", encoding="utf-8"
    ) as outfile:
        for i, line in enumerate(infile):
            # Split on a literal single space (not arbitrary whitespace), so
            # tabs or runs of spaces inside a line make it invalid.
            fields = line.strip("\r\n ").split(" ")
            if len(fields) == 2:
                # Write the original line, not the stripped one, so kept
                # entries are preserved as they appeared in the input.
                outfile.write(line)
            else:
                print(f"{i}: {line.strip()}")
                # Dump every character's code point so invisible or unusual
                # characters in the rejected line can be identified.
                for c in line:
                    print(f"{c}:{hex(ord(c))}")
|
|
|
|
|
if __name__ == "__main__":
    # Two positional arguments are required: the input vocabulary path and
    # the path the cleaned vocabulary is written to. Exit with a usage hint
    # rather than an IndexError traceback when either one is missing.
    if len(sys.argv) < 3:
        sys.exit(f"Usage: {sys.argv[0]} <in_vocab_fname> <out_vocab_fname>")

    in_vocab_fname = sys.argv[1]
    out_vocab_fname = sys.argv[2]
    clean_vocab(in_vocab_fname, out_vocab_fname)
|
|