import argparse
import glob
import json
import os
from multiprocessing import Manager, Pool

import pandas as pd
from natsort import index_natsorted

from .logger import logger


def process_file(file_path, shared_list):
    """Read one jsonl file and append each record to the shared list."""
    with open(file_path, "r") as f:
        for line in f:
            line = line.strip()
            # Skip blank lines so a trailing newline does not break json.loads.
            if not line:
                continue
            shared_list.append(json.loads(line))


def parse_args():
    parser = argparse.ArgumentParser(
        description="Gather all jsonl files in a folder (meta_folder) into a single jsonl file (meta_file_path)."
    )
    parser.add_argument("--meta_folder", type=str, required=True)
    parser.add_argument("--meta_file_path", type=str, required=True)
    parser.add_argument("--video_path_column", type=str, default="video_path")
    parser.add_argument("--n_jobs", type=int, default=1)
    return parser.parse_args()


def main():
    args = parse_args()
    jsonl_files = glob.glob(os.path.join(args.meta_folder, "*.jsonl"))

    with Manager() as manager:
        # A manager list lets worker processes append records concurrently.
        shared_list = manager.list()
        with Pool(processes=args.n_jobs) as pool:
            results = [
                pool.apply_async(process_file, args=(file_path, shared_list))
                for file_path in jsonl_files
            ]
            pool.close()
            pool.join()
            # Re-raise any exception that occurred in a worker process;
            # apply_async swallows them otherwise.
            for result in results:
                result.get()

        with open(args.meta_file_path, "w") as f:
            for item in shared_list:
                f.write(json.dumps(item) + "\n")

    # Sort the gathered records naturally by the video path column
    # (e.g. video_2.mp4 before video_10.mp4) and overwrite the file.
    df = pd.read_json(args.meta_file_path, lines=True)
    df = df.iloc[index_natsorted(df[args.video_path_column])].reset_index(drop=True)
    logger.info(f"Saved the gathered single jsonl file to {args.meta_file_path}.")
    df.to_json(args.meta_file_path, orient="records", lines=True, force_ascii=False)


if __name__ == "__main__":
    main()
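# Example invocation. The module path (`toolkit.gather_jsonl`) and the file
# layout below are hypothetical; the relative import of `logger` means the
# script is expected to be run with `python -m` from the package root:
#
#   python -m toolkit.gather_jsonl \
#       --meta_folder ./meta_parts \
#       --meta_file_path ./meta.jsonl \
#       --video_path_column video_path \
#       --n_jobs 8
#
# Each worker reads one `*.jsonl` shard from ./meta_parts, and the merged,
# naturally sorted result is written to ./meta.jsonl.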