import json


def _is_mismatched(line: str) -> bool:
    """Return True if the JSON record in *line* has unequal tokens/tags lengths.

    Raises:
        json.JSONDecodeError: If *line* is not valid JSON.
        KeyError: If the record lacks 'tokens' or 'tags_knowledge'.
    """
    record = json.loads(line)
    return len(record['tokens']) != len(record['tags_knowledge'])


def count_mismatch(file_path: str) -> int:
    """Count the records whose 'tokens' and 'tags_knowledge' lengths differ.

    The file is read as JSON Lines (one JSON object per line). Blank or
    whitespace-only lines are not records and are skipped.

    Args:
        file_path: Path of the JSON Lines file to inspect.

    Returns:
        The number of mismatched records.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks 'tokens' or 'tags_knowledge'.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return sum(
            1 for line in file if line.strip() and _is_mismatched(line)
        )


def delete_mismatched_lines(file_path: str) -> int:
    """Rewrite the file in place without its mismatched records.

    A record is mismatched when its 'tokens' and 'tags_knowledge' lengths
    differ. Blank or whitespace-only lines are kept as they are.

    Args:
        file_path: Path of the JSON Lines file to clean.

    Returns:
        The number of lines removed.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks 'tokens' or 'tags_knowledge'.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Validate every line BEFORE reopening the file for writing: mode 'w'
    # truncates immediately, so a parse error raised mid-rewrite would
    # otherwise destroy all records that had not been written back yet.
    kept = [
        line for line in lines
        if not line.strip() or not _is_mismatched(line)
    ]
    removed = len(lines) - len(kept)

    # Leave the file untouched when there is nothing to delete.
    if removed:
        with open(file_path, 'w', encoding='utf-8') as file:
            file.writelines(kept)
    return removed


if __name__ == "__main__":
    file_path = 'data/tags-04-01-2025.jsonl'
    count = count_mismatch(file_path)
    if count > 0:
        delete_mismatched_lines(file_path)
        print(f"Deleted {count} mismatched lines.")