File size: 959 Bytes
762e05d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import json

def count_mismatch(file_path):
    """Count the records in a JSONL file whose token and tag lengths differ.

    Each non-blank line is parsed as a JSON object and the lengths of its
    ``tokens`` and ``tags_knowledge`` values are compared.

    Args:
        file_path: Path to a file holding one JSON object per line.

    Returns:
        The number of records where ``len(tokens) != len(tags_knowledge)``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``tokens`` or ``tags_knowledge``.
    """
    mismatches = 0
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank lines (e.g. a trailing empty line) carry no record and
            # would otherwise make json.loads raise.
            if not line.strip():
                continue
            data = json.loads(line)
            tokens, tags = data['tokens'], data['tags_knowledge']
            if len(tokens) != len(tags):
                mismatches += 1

    return mismatches

def delete_mismatched_lines(file_path):
    """Rewrite a JSONL file in place, dropping length-mismatched records.

    A record is kept when ``len(tokens) == len(tags_knowledge)``. Kept lines
    are written back exactly as they were read. Blank lines are passed
    through unchanged rather than parsed.

    Every line is parsed and checked *before* the file is reopened for
    writing, so a malformed record raises while the original file is still
    intact instead of leaving it truncated.

    Args:
        file_path: Path to a file holding one JSON object per line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``tokens`` or ``tags_knowledge``.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    kept_lines = []
    for line in lines:
        if not line.strip():
            # Not a record: nothing to validate, keep the line as-is.
            kept_lines.append(line)
            continue
        data = json.loads(line)
        tokens, tags = data['tokens'], data['tags_knowledge']
        if len(tokens) == len(tags):
            kept_lines.append(line)

    # Only now truncate the file: all parsing above has already succeeded.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(kept_lines)


if __name__ == "__main__":
    # Dataset that this script checks and, if needed, cleans in place.
    jsonl_path = 'data/tags-04-01-2025.jsonl'
    mismatched = count_mismatch(jsonl_path)

    # Rewrite the file only when at least one record is inconsistent.
    if mismatched > 0:
        delete_mismatched_lines(jsonl_path)
        print(f"Deleted {mismatched} mismatched lines.")