"""Batch-wise multiprocessing map with an Excel checkpoint after every batch."""

from typing import *

import datetime
import multiprocessing
import os
from multiprocessing import Pool

import pandas as pd
from loguru import logger
from tqdm import tqdm

# Number of worker processes used by the pool (one per logical CPU).
cpu_num = multiprocessing.cpu_count()
logger.info(f"cpu_num: {cpu_num}")

# Timestamp taken once at import time; used in the default checkpoint filename.
date_str = datetime.datetime.now().strftime("%Y%m%d_%Hh%Mm%Ss")


def multiprocessing_mapping(
    mapping_func,
    items: List[Any],
    batch_size=1000,
    tmp_filepath=f"./output/multiprocessing_mapping_{date_str}_tmp.xlsx",
) -> List[Any]:
    """Apply ``mapping_func`` to ``items`` in parallel, one batch at a time.

    After each batch, every result collected so far is written to
    ``tmp_filepath`` as an Excel sheet, so partial progress survives a crash
    in a later batch.

    Args:
        mapping_func: Callable applied to each item. It is sent to worker
            processes, so it must be picklable (typically a module-level
            function).
        items: Sequence of inputs; must support ``len()`` and slicing.
        batch_size: Number of items handed to the pool per batch. Must be
            at least 1.
        tmp_filepath: Path of the Excel checkpoint file. Its parent directory
            is created if it does not exist. The default name embeds the
            module import timestamp.

    Returns:
        The results of ``mapping_func`` for every item, in input order.
        NOTE(review): results are presumably dict-like rows, since they are
        fed to ``pd.DataFrame`` -- confirm against callers.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Nothing to map: avoid spawning a full worker pool for no work.
    if len(items) == 0:
        return []

    # Make sure the checkpoint can actually be written (e.g. ./output/ may
    # not exist yet). dirname is "" for a bare filename -> nothing to create.
    parent_dir = os.path.dirname(tmp_filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    total_rows: List[Any] = []
    pool = Pool(processes=cpu_num)
    try:
        for start in tqdm(range(0, len(items), batch_size)):
            batch = items[start:start + batch_size]
            total_rows += pool.map(mapping_func, batch)

            # Checkpoint: rewrite the file with everything gathered so far.
            df = pd.DataFrame(total_rows)
            df.to_excel(tmp_filepath, index=False)
        pool.close()
    except BaseException:
        # Do not leave worker processes behind when a batch or the
        # checkpoint write fails (or the user interrupts).
        pool.terminate()
        raise
    finally:
        # Valid after either close() or terminate(); reaps the workers.
        pool.join()

    return total_rows