Spaces:
Running
Running
File size: 893 Bytes
d44849f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 |
import sys
def clean_vocab(in_vocab_fname: str, out_vocab_fname: str) -> None:
    """
    Cleans a vocabulary file by filtering out invalid lines.

    A line is considered valid when, after stripping surrounding carriage
    returns, newlines and spaces, splitting it on a single space yields
    exactly two fields. Valid lines are copied to the output file unchanged.
    Every other line is dropped and reported on standard output: first its
    zero-based line index and stripped text, then each of its characters
    alongside the hexadecimal value of its code point.

    Args:
        in_vocab_fname (str): path of the input vocabulary file.
        out_vocab_fname (str): path of the output vocabulary file.
    """
    with open(in_vocab_fname, "r", encoding="utf-8") as infile, open(
        out_vocab_fname, "w", encoding="utf-8"
    ) as outfile:
        for i, line in enumerate(infile):
            # Split on a literal single space (not arbitrary whitespace), so
            # tab-separated or double-spaced entries count as invalid.
            fields = line.strip("\r\n ").split(" ")
            if len(fields) == 2:
                # Write the original line, not the stripped one, so the
                # line's own terminator and spacing are preserved.
                outfile.write(line)
                continue

            # Invalid line: dump it character by character so invisible or
            # unexpected code points can be spotted.
            print(f"{i}: {line.strip()}")
            for c in line:
                print(f"{c}:{hex(ord(c))}")
if __name__ == "__main__":
    # Command-line entry point: argv[1] is the input vocabulary file and
    # argv[2] is the path the cleaned vocabulary is written to.
    if len(sys.argv) < 3:
        # Exit with a usage hint instead of an IndexError traceback when
        # arguments are missing; extra arguments are still ignored.
        sys.exit(f"Usage: {sys.argv[0]} <in_vocab_fname> <out_vocab_fname>")
    in_vocab_fname = sys.argv[1]
    out_vocab_fname = sys.argv[2]
    clean_vocab(in_vocab_fname, out_vocab_fname)
|