"""Replace a substring inside every string value of a JSONL file.

Each non-blank input line is parsed as one JSON document, every string value
(at any nesting depth) has ``old_text`` replaced by ``new_text``, and the
result is written back out as one JSON document per line.
"""

import argparse
import json
import os
from typing import Iterable, Iterator

try:
    from tqdm import tqdm
except ImportError:
    # The progress bar is purely cosmetic, so the script keeps working
    # (without progress output) when tqdm is not installed.
    def tqdm(iterable, **_kwargs):
        """Fallback stand-in for ``tqdm.tqdm`` that just returns *iterable*."""
        return iterable


def process_line(line: str, old_text: str, new_text: str) -> str:
    """Return *line* re-serialized with ``old_text`` replaced in string values.

    Args:
        line: A single JSON document encoded as text.
        old_text: Substring to search for inside string values.
        new_text: Replacement for every occurrence of ``old_text``.

    Returns:
        The processed document serialized with ``ensure_ascii=False``, so
        non-ASCII characters are written as-is rather than escaped.

    Raises:
        json.JSONDecodeError: If *line* is not valid JSON.
    """
    data = json.loads(line)

    def replace_text(obj):
        # Walk nested containers; only string *values* are rewritten,
        # dictionary keys are passed through untouched.
        if isinstance(obj, dict):
            return {k: replace_text(v) for k, v in obj.items()}
        if isinstance(obj, list):
            return [replace_text(item) for item in obj]
        if isinstance(obj, str):
            return obj.replace(old_text, new_text)
        # Numbers, booleans and null carry no text to replace.
        return obj

    return json.dumps(replace_text(data), ensure_ascii=False)


def process_lines(lines: Iterable[str], old_text: str, new_text: str) -> Iterator[str]:
    """Yield the processed form of every non-blank line in *lines*.

    Blank or whitespace-only lines (for example a trailing empty line at the
    end of a file) are skipped instead of being handed to the JSON parser,
    which would reject them.
    """
    for raw_line in lines:
        stripped = raw_line.strip()
        if not stripped:
            continue
        yield process_line(stripped, old_text, new_text)


def main(input_file: str, output_file: str, old_text: str, new_text: str) -> None:
    """Copy *input_file* to *output_file*, replacing text in every JSON line.

    Args:
        input_file: Path of the UTF-8 JSONL file to read.
        output_file: Path of the UTF-8 JSONL file to write (overwritten).
        old_text: Substring to search for inside string values.
        new_text: Replacement for every occurrence of ``old_text``.

    Raises:
        ValueError: If both paths resolve to the same file.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Opening the output in 'w' mode truncates it immediately; if it is the
    # same file as the input, all data would be lost before being read.
    if os.path.realpath(input_file) == os.path.realpath(output_file):
        raise ValueError("input_file and output_file must be different files")

    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        # First pass: count lines so the progress bar knows its total.
        total_lines = sum(1 for _ in infile)
        infile.seek(0)  # rewind for the real processing pass

        progress = tqdm(infile, total=total_lines, desc="Processing")
        for processed_line in process_lines(progress, old_text, new_text):
            outfile.write(processed_line + '\n')


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Replace text in a JSONL file.")
    parser.add_argument("input_file", help="Input JSONL file to process")
    parser.add_argument("output_file", help="Output file for processed JSONL")
    parser.add_argument("--old_text", default="尖米", help="Text to be replaced")
    parser.add_argument("--new_text", default="机智流", help="Text to replace with")

    args = parser.parse_args()

    main(args.input_file, args.output_file, args.old_text, args.new_text)