Spaces:
Running
Running
import os
import sys
def clean_vocab(in_vocab_fname: str, out_vocab_fname: str) -> None:
    """
    Cleans a vocabulary file by filtering out invalid lines.

    A line is valid when, after stripping surrounding spaces and line
    terminators, splitting it on single spaces yields exactly two fields.
    Valid lines are written to the output file as they were read. Each
    invalid line is reported on stdout with its zero-based line index,
    followed by one "<char>:<hex code point>" line per character so that
    unexpected or invisible characters can be spotted.

    Args:
        in_vocab_fname (str): path of the input vocabulary file.
        out_vocab_fname (str): path of the output vocabulary file.

    Raises:
        ValueError: if both paths refer to the same existing file.
    """
    # Opening the output with "w" truncates it immediately; if it is the same
    # file as the input, the vocabulary would be wiped before being read.
    if os.path.exists(out_vocab_fname) and os.path.samefile(
        in_vocab_fname, out_vocab_fname
    ):
        raise ValueError(
            "in_vocab_fname and out_vocab_fname must refer to different files"
        )

    with open(in_vocab_fname, "r", encoding="utf-8") as infile, open(
        out_vocab_fname, "w", encoding="utf-8"
    ) as outfile:
        for i, line in enumerate(infile):
            fields = line.strip("\r\n ").split(" ")
            if len(fields) == 2:
                outfile.write(line)
                continue

            # Invalid line: show it, then dump every character's code point.
            print(f"{i}: {line.strip()}")
            for c in line:
                print(f"{c}:{hex(ord(c))}")
if __name__ == "__main__":
    # Both positional paths are required; exit with a usage message instead
    # of a bare IndexError traceback when they are missing. Any additional
    # arguments are ignored.
    if len(sys.argv) < 3:
        sys.exit(f"Usage: {sys.argv[0]} <in_vocab_fname> <out_vocab_fname>")

    in_vocab_fname = sys.argv[1]
    out_vocab_fname = sys.argv[2]
    clean_vocab(in_vocab_fname, out_vocab_fname)