Spaces:
Runtime error
Runtime error
import sys | |
from transformers import AutoTokenizer | |
dataset = sys.argv[1] | |
model_name_or_path = sys.argv[2] | |
max_len = int(sys.argv[3]) | |
subword_len_counter = 0 | |
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) | |
max_len -= tokenizer.num_special_tokens_to_add() | |
with open(dataset, "rt") as f_p: | |
for line in f_p: | |
line = line.rstrip() | |
if not line: | |
print(line) | |
subword_len_counter = 0 | |
continue | |
token = line.split()[0] | |
current_subwords_len = len(tokenizer.tokenize(token)) | |
# Token contains strange control characters like \x96 or \x95 | |
# Just filter out the complete line | |
if current_subwords_len == 0: | |
continue | |
if (subword_len_counter + current_subwords_len) > max_len: | |
print("") | |
print(line) | |
subword_len_counter = current_subwords_len | |
continue | |
subword_len_counter += current_subwords_len | |
print(line) | |