File size: 959 Bytes
762e05d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 |
import json
def count_mismatch(file_path):
    """Count the JSONL records whose token and tag lists differ in length.

    Every non-blank line of the file is parsed as a JSON object, and the
    lengths of its ``tokens`` and ``tags_knowledge`` values are compared.

    Args:
        file_path: Path of the JSON Lines file to inspect.

    Returns:
        The number of records where
        ``len(tokens) != len(tags_knowledge)``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``tokens`` or ``tags_knowledge``.
    """
    mismatches = 0
    # Pin the encoding so the result does not depend on the platform locale.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                # Blank lines (e.g. a trailing empty line) are not records.
                continue
            data = json.loads(line)
            if len(data['tokens']) != len(data['tags_knowledge']):
                mismatches += 1
    return mismatches
def delete_mismatched_lines(file_path):
    """Rewrite a JSONL file in place, dropping length-mismatched records.

    A record is kept when its ``tokens`` and ``tags_knowledge`` values have
    the same length. Blank lines are not records and are written back
    unchanged.

    Args:
        file_path: Path of the JSON Lines file to clean.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``tokens`` or ``tags_knowledge``.

    If either exception is raised, the file is left unmodified.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Decide which lines survive *before* reopening the file for writing:
    # mode 'w' truncates immediately, so a parse error raised mid-rewrite
    # would otherwise destroy the data.
    kept = []
    for line in lines:
        if line.strip():
            data = json.loads(line)
            if len(data['tokens']) != len(data['tags_knowledge']):
                continue
        kept.append(line)

    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(kept)
if __name__ == "__main__":
    import sys

    # An optional first command-line argument overrides the default data file.
    if len(sys.argv) > 1:
        file_path = sys.argv[1]
    else:
        file_path = 'data/tags-04-01-2025.jsonl'

    # Count first so the file is only rewritten when something must be removed.
    count = count_mismatch(file_path)
    if count > 0:
        delete_mismatched_lines(file_path)
        print(f"Deleted {count} mismatched lines.")