File size: 1,609 Bytes
476ac07
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import json
from datetime import datetime


def parse_args():
    """Parse the command-line arguments of this script.

    Returns:
        argparse.Namespace: Parsed arguments with the attributes
            ``src_file`` and ``dst_file`` (positional paths),
            ``categories`` (list of category strings) and ``start_date``
            (date string, expected as YYYY-MM-DD).
    """
    arg_parser = argparse.ArgumentParser()

    # Required positionals: the input path first, then the output path.
    positionals = (
        ('src_file', 'source file path'),
        ('dst_file', 'destination file path'),
    )
    for arg_name, arg_help in positionals:
        arg_parser.add_argument(arg_name, help=arg_help)

    # Optional filters; the defaults apply when a flag is not given.
    category_options = dict(
        nargs='+',
        default=['cs.AI', 'cs.CL', 'cs.CV'],
        help='target categories')
    arg_parser.add_argument('--categories', **category_options)

    date_options = dict(
        default='2020-01-01', help='start date (format: YYYY-MM-DD)')
    arg_parser.add_argument('--start-date', **date_options)

    return arg_parser.parse_args()


def has_intersection(list1, list2):
    """Tell whether two iterables have at least one element in common.

    Args:
        list1 (Iterable): First collection of hashable items.
        list2 (Iterable): Second collection of hashable items.

    Returns:
        bool: True if some item occurs in both collections, else False.
    """
    # A non-empty set is truthy, so the shared-items set answers directly.
    shared = set(list1) & set(list2)
    return bool(shared)


def read_json_file(file_path):
    """Load a JSON Lines file (one JSON document per line) into a list.

    Every non-blank line is decoded on its own with ``json.loads``. A line
    that is not valid JSON is reported on stdout and skipped, so a single
    corrupt record does not abort the whole load.

    Args:
        file_path (str): Path of the file to read.

    Returns:
        list: The decoded values, in the order they appear in the file.
    """
    data = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default encoding.
    with open(file_path, encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                # Blank lines hold no record; skip them without a warning.
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError:
                print(f'Failed to parse line: {line}')
    return data


def main():
    """Filter the source records and write the survivors as one JSON list.

    A record is kept when its whitespace-separated ``categories`` string
    shares at least one entry with the requested categories and its
    ``update_date`` is on or after the requested start date.
    """
    args = parse_args()
    records = read_json_file(args.src_file)
    cutoff = datetime.strptime(args.start_date, '%Y-%m-%d')

    kept = []
    for record in records:
        record_categories = record['categories'].split()
        if not has_intersection(args.categories, record_categories):
            continue
        # The date is only parsed for records that passed the category
        # check.
        updated = datetime.strptime(record['update_date'], '%Y-%m-%d')
        if updated < cutoff:
            continue
        kept.append(record)

    with open(args.dst_file, 'w') as out_file:
        json.dump(kept, out_file)

    print(f'Save to {args.dst_file}\n{len(kept)} items')


# Run the filter only when this file is executed as a script, so importing
# the module has no side effects.
if __name__ == '__main__':
    main()