"""Replace text in a JSONL file."""
import argparse
import json
import os

from tqdm import tqdm
def _replace_in_strings(obj, old_text, new_text):
    """Return a copy of *obj* with old_text replaced in every string value.

    Dicts and lists are rebuilt recursively. Dict keys are kept as-is, and
    any non-string leaf (numbers, booleans, None) is returned unchanged.
    """
    if isinstance(obj, dict):
        return {
            key: _replace_in_strings(value, old_text, new_text)
            for key, value in obj.items()
        }
    if isinstance(obj, list):
        return [_replace_in_strings(item, old_text, new_text) for item in obj]
    if isinstance(obj, str):
        return obj.replace(old_text, new_text)
    return obj


def process_line(line, old_text, new_text):
    """Replace old_text with new_text in every string value of one JSON line.

    Args:
        line: A single JSON document as text (one JSONL record).
        old_text: Substring to search for inside string values.
        new_text: Replacement substring.

    Returns:
        The re-serialized JSON text. Non-ASCII characters are written as-is
        (``ensure_ascii=False``) rather than as ``\\uXXXX`` escapes.

    Raises:
        json.JSONDecodeError: If *line* is not valid JSON (including "").
    """
    data = json.loads(line)
    processed_data = _replace_in_strings(data, old_text, new_text)
    return json.dumps(processed_data, ensure_ascii=False)
def main(input_file, output_file, old_text, new_text):
    """Copy a JSONL file, replacing old_text with new_text in each record.

    Each non-blank line of *input_file* is passed through ``process_line``
    and written to *output_file* followed by a newline. Both files are
    read/written as UTF-8. A tqdm progress bar tracks the lines consumed.

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Path of the file to write; it is overwritten.
        old_text: Substring to replace inside string values.
        new_text: Replacement substring.

    Raises:
        ValueError: If input_file and output_file resolve to the same file.
    """
    # Opening the output with 'w' truncates it immediately; if both paths
    # name the same file the input would be wiped before any line is read.
    if os.path.realpath(input_file) == os.path.realpath(output_file):
        raise ValueError("input_file and output_file must be different files")

    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(output_file, 'w', encoding='utf-8') as outfile:
        # First pass: count the lines so the progress bar has a total.
        total_lines = sum(1 for _ in infile)
        infile.seek(0)  # rewind to the start for the real pass

        for line in tqdm(infile, total=total_lines, desc="Processing"):
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing empty line) are not JSON and
                # would make json.loads raise, so they are skipped.
                continue
            outfile.write(process_line(stripped, old_text, new_text) + '\n')
if __name__ == "__main__":
    # Command-line entry point: two positional paths plus optional
    # --old_text / --new_text overrides for the replacement pair.
    parser = argparse.ArgumentParser(description="Replace text in a JSONL file.")
    parser.add_argument("input_file", help="Input JSONL file to process")
    parser.add_argument("output_file", help="Output file for processed JSONL")
    parser.add_argument("--old_text", default="尖米", help="Text to be replaced")
    parser.add_argument("--new_text", default="机智流", help="Text to replace with")
    args = parser.parse_args()
    main(args.input_file, args.output_file, args.old_text, args.new_text)