MinerU / parallel_multiproc.py
SkyNait's picture
Hypothetical parallel processing
11551ca
raw
history blame
1.69 kB
#!/usr/bin/env python3
import os
import sys
import torch
import logging
import multiprocessing as mp
from mineru_single import to_markdown
logging.basicConfig(level=logging.INFO)
def worker(worker_id, gpu_id, pdf_list, output_dir):
    """
    Convert every PDF in `pdf_list` by calling `to_markdown`, one file at a time.

    Args:
        worker_id: Identifier of this worker; used only in log messages.
        gpu_id: Value written (as a string) to the CUDA_VISIBLE_DEVICES
            environment variable of the current process.
        pdf_list: Iterable of PDF file paths, processed in order.
        output_dir: Forwarded unchanged to `to_markdown` as `output_dir`.

    A failure on one file is logged and does not stop the remaining files.
    """
    # The variable is set unconditionally, whether or not a GPU exists.
    # NOTE(review): this presumably only takes effect if CUDA has not been
    # initialised in this process yet -- confirm against how the process is
    # started and what `mineru_single` does at import time.
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)

    for pdf_path in pdf_list:
        try:
            logging.info(f"Worker {worker_id}, GPU {gpu_id} -> {pdf_path}")
            to_markdown(
                file_path=pdf_path,
                output_dir=output_dir,
            )
        except Exception as e:
            # Best-effort batch: keep going with the next file, but record the
            # traceback so the failure can be diagnosed from the logs.
            logging.exception(f"Worker {worker_id} error on {pdf_path}: {e}")
def process_batch_in_parallel(pdf_paths, output_dir="./output", num_workers=2, num_gpus=1):
    """
    Split `pdf_paths` into contiguous chunks and process each chunk in its own process.

    Args:
        pdf_paths: Sequence of PDF file paths (must support `len()` and slicing).
        output_dir: Output directory forwarded to every worker.
        num_workers: Maximum number of processes to start; must be >= 1.
            Fewer are started when there are fewer PDFs than workers.
        num_gpus: Number of GPU ids to cycle through; must be >= 1.
            Worker `i` receives GPU id `i % num_gpus`.

    Returns:
        None. Blocks until every started process has exited.

    Raises:
        ValueError: If `num_workers` or `num_gpus` is smaller than 1
            (only checked when there is something to process).
    """
    if not pdf_paths:
        logging.info("No PDFs to process.")
        return

    # Fail with a clear message instead of a ZeroDivisionError (zero) or a
    # silent no-op that still reports success (negative worker count).
    if num_workers < 1:
        raise ValueError(f"num_workers must be >= 1, got {num_workers}")
    if num_gpus < 1:
        raise ValueError(f"num_gpus must be >= 1, got {num_gpus}")

    # Balanced contiguous chunks: the first `extra` workers take one more file.
    # This keeps every requested worker busy; plain ceil-division chunking can
    # leave workers idle (e.g. 9 files / 8 workers would start only 5).
    total = len(pdf_paths)
    worker_count = min(num_workers, total)
    base_size, extra = divmod(total, worker_count)

    # NOTE(review): processes use the platform's default start method. If CUDA
    # may already be initialised in the parent, a "spawn" context is probably
    # required -- confirm before changing, since it affects how callers must
    # guard their entry point.
    processes = []
    start_idx = 0
    for worker_id in range(worker_count):
        end_idx = start_idx + base_size + (1 if worker_id < extra else 0)
        subset = pdf_paths[start_idx:end_idx]
        start_idx = end_idx

        gpu_id = worker_id % num_gpus
        p = mp.Process(target=worker, args=(worker_id, gpu_id, subset, output_dir))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()
        # Per-file errors are handled inside the worker, so a non-zero exit
        # code means the process itself died; surface that in the logs.
        if p.exitcode != 0:
            logging.error(f"Worker process {p.name} exited with code {p.exitcode}")

    logging.info("All parallel processing complete.")