#!/usr/bin/env python3
"""Parallel PDF -> markdown conversion, fanning out over worker processes/GPUs."""
import os
import sys
import torch
import logging
import multiprocessing as mp

from mineru_single import to_markdown

logging.basicConfig(level=logging.INFO)


def worker(worker_id, gpu_id, pdf_list, output_dir):
    """
    Worker process entry point.

    Pins this process to a single GPU by setting ``CUDA_VISIBLE_DEVICES``
    before any CUDA context is created, then converts each PDF in
    *pdf_list* via ``to_markdown``. A failure on one file is logged (with
    traceback) and does not abort the remaining files.

    :param worker_id: index of this worker, used only for log messages
    :param gpu_id: GPU ordinal this worker is restricted to
    :param pdf_list: paths of the PDFs this worker should convert
    :param output_dir: directory passed through to ``to_markdown``
    """
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
    for pdf_path in pdf_list:
        try:
            # Lazy %-style args: the message is only formatted if emitted.
            logging.info("Worker %s, GPU %s -> %s", worker_id, gpu_id, pdf_path)
            to_markdown(file_path=pdf_path, output_dir=output_dir)
        except Exception:
            # logging.exception records the full traceback, unlike the
            # original f-string logging.error which kept only str(e).
            logging.exception("Worker %s error on %s", worker_id, pdf_path)


def process_batch_in_parallel(pdf_paths, output_dir="./output", num_workers=2, num_gpus=1):
    """
    Split *pdf_paths* into contiguous chunks and process them in parallel.

    Spawns up to *num_workers* child processes; worker ``i`` handles chunk
    ``i`` and is assigned GPU ``i % num_gpus`` (round-robin). Blocks until
    every child has finished.

    :param pdf_paths: list of PDF file paths to convert
    :param output_dir: directory passed through to ``to_markdown``
    :param num_workers: maximum number of child processes to start
    :param num_gpus: number of GPUs to round-robin over (clamped to >= 1)
    """
    if not pdf_paths:
        logging.info("No PDFs to process.")
        return

    # Guard: num_gpus <= 0 would make `worker_id % num_gpus` raise
    # ZeroDivisionError. Fall back to a single (virtual) GPU slot.
    num_gpus = max(1, num_gpus)

    # Use the "spawn" start method rather than the Linux default "fork":
    # a forked child inherits the parent's process state, so the
    # CUDA_VISIBLE_DEVICES assignment inside `worker` may be ignored.
    # "spawn" gives each child a fresh interpreter, which is PyTorch's
    # recommended way to combine multiprocessing with CUDA.
    # NOTE(review): spawn requires callers running this as a script to
    # guard with `if __name__ == "__main__":` — confirm call sites do.
    ctx = mp.get_context("spawn")

    # Ceiling division so every path lands in exactly one chunk.
    chunk_size = (len(pdf_paths) + num_workers - 1) // num_workers

    processes = []
    for worker_id in range(num_workers):
        start_idx = worker_id * chunk_size
        subset = pdf_paths[start_idx:start_idx + chunk_size]
        if not subset:
            # Fewer PDFs than workers: stop spawning idle processes.
            break
        gpu_id = worker_id % num_gpus
        p = ctx.Process(target=worker, args=(worker_id, gpu_id, subset, output_dir))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()

    logging.info("All parallel processing complete.")