Spaces:
Sleeping
Sleeping
import json | |
def count_mismatch(file_path):
    """Count JSONL records whose token and tag lists differ in length.

    Every non-blank line of ``file_path`` is parsed as a JSON object, and the
    length of its ``'tokens'`` value is compared with the length of its
    ``'tags_knowledge'`` value.

    Args:
        file_path: Path of the JSON Lines file to scan.

    Returns:
        The number of records where the two lengths are not equal.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``'tokens'`` or ``'tags_knowledge'``.
    """
    mismatches = 0
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank lines (e.g. a stray newline at the end of the file) are
            # not records; json.loads would raise on them.
            if not line.strip():
                continue
            data = json.loads(line)
            if len(data['tokens']) != len(data['tags_knowledge']):
                mismatches += 1
    return mismatches
def delete_mismatched_lines(file_path):
    """Rewrite a JSONL file in place, dropping length-mismatched records.

    A record is kept when the lengths of its ``'tokens'`` and
    ``'tags_knowledge'`` values are equal. Kept lines are written back
    unchanged and in their original order; blank lines are preserved as-is.

    Args:
        file_path: Path of the JSON Lines file to filter in place.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``'tokens'`` or ``'tags_knowledge'``.
            In both cases the file is left untouched.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Decide what to keep *before* reopening the file for writing: mode 'w'
    # truncates immediately, so an exception raised while parsing would
    # otherwise destroy every record that had not yet been written back.
    kept = []
    for line in lines:
        if not line.strip():
            # Not a record; json.loads would raise on it. Keep it unchanged.
            kept.append(line)
            continue
        data = json.loads(line)
        if len(data['tokens']) == len(data['tags_knowledge']):
            kept.append(line)

    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(kept)
if __name__ == "__main__":
    # Script entry point: scan the hard-coded tag file and, when at least one
    # record has differing token/tag lengths, filter the file in place.
    jsonl_path = 'data/tags-04-01-2025.jsonl'
    count = count_mismatch(jsonl_path)
    # NOTE(review): the original indentation was lost in extraction; the
    # report is assumed to belong inside the `count > 0` branch -- confirm.
    if count > 0:
        delete_mismatched_lines(jsonl_path)
        print(f"Deleted {count} mismatched lines.")