Spaces:
Runtime error
Runtime error
# Copyright (c) OpenMMLab. All rights reserved.
import argparse | |
import json | |
from datetime import datetime | |
def parse_args():
    """Parse the command-line arguments of the filtering script.

    Returns:
        argparse.Namespace: namespace with ``src_file`` and ``dst_file``
        (path strings), ``categories`` (list of str, default
        ``['cs.AI', 'cs.CL', 'cs.CV']``) and ``start_date`` (str in
        ``YYYY-MM-DD`` form, default ``'2020-01-01'``).

    Raises:
        SystemExit: raised by argparse on invalid usage, including a
            ``--start-date`` value that is not a valid ``YYYY-MM-DD`` date.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('src_file', help='source file path')
    parser.add_argument('dst_file', help='destination file path')
    parser.add_argument(
        '--categories',
        nargs='+',
        default=['cs.AI', 'cs.CL', 'cs.CV'],
        help='target categories')
    parser.add_argument(
        '--start-date',
        # Validate while parsing so a malformed date is reported as a usage
        # error up front; the stored value stays a plain string.
        type=_date_string,
        default='2020-01-01',
        help='start date (format: YYYY-MM-DD)')
    return parser.parse_args()


def _date_string(text):
    """Return ``text`` unchanged if it parses as a ``YYYY-MM-DD`` date."""
    try:
        datetime.strptime(text, '%Y-%m-%d')
    except ValueError:
        raise argparse.ArgumentTypeError(
            f'invalid date {text!r}, expected format YYYY-MM-DD') from None
    return text
def has_intersection(list1, list2):
    """Return whether two iterables share at least one element.

    Args:
        list1: iterable of hashable items.
        list2: iterable of hashable items.

    Returns:
        bool: True if some item occurs in both iterables, otherwise False.
    """
    # isdisjoint stops at the first common element and avoids building a
    # second set plus the intersection set just to check for emptiness.
    return not set(list1).isdisjoint(list2)
def read_json_file(file_path):
    """Load a JSON Lines file (one JSON document per line).

    Lines that are empty or whitespace-only are skipped. A line that is not
    valid JSON is reported on stdout and skipped; it does not abort the read.

    Args:
        file_path: path of the file to read.

    Returns:
        list: the decoded value of every line that parsed, in file order.
    """
    data = []
    # Decode as UTF-8 explicitly instead of relying on the platform's locale
    # encoding, which would garble or reject non-ASCII text on some systems.
    with open(file_path, encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                # A blank line holds no record; do not report it as a failure.
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError:
                print(f'Failed to parse line: {line}')
    return data
def main():
    """Filter a JSON Lines metadata dump by category and update date.

    Reads ``args.src_file``, keeps the records whose whitespace-separated
    ``categories`` field contains at least one of ``args.categories`` and
    whose ``update_date`` is on or after ``args.start_date``, then writes the
    kept records to ``args.dst_file`` as a single JSON array.

    Raises:
        ValueError: if ``args.start_date`` or a record's ``update_date`` does
            not match ``YYYY-MM-DD``.
        KeyError: if a record lacks ``categories`` or ``update_date``.
    """
    args = parse_args()
    # Parse the threshold before touching the source file so a malformed
    # --start-date fails immediately, not after the whole file was loaded.
    from_time = datetime.strptime(args.start_date, '%Y-%m-%d')
    json_data = read_json_file(args.src_file)

    # The category test comes first so the date is only parsed for records
    # that already match a requested category.
    filtered_data = [
        item for item in json_data
        if has_intersection(args.categories, item['categories'].split())
        and datetime.strptime(item['update_date'], '%Y-%m-%d') >= from_time
    ]

    with open(args.dst_file, 'w', encoding='utf-8') as file:
        json.dump(filtered_data, file)
    print(f'Save to {args.dst_file}\n{len(filtered_data)} items')
# Run the filter only when executed as a script, not when imported.
if __name__ == '__main__':
    main()