#!/usr/bin/env python3
import os
import sys
import torch
import logging
import multiprocessing as mp
from mineru_single import to_markdown
logging.basicConfig(level=logging.INFO)
def worker(worker_id, gpu_id, pdf_list, output_dir):
    """
    Process one chunk of PDF files inside a dedicated subprocess.

    Pins this process to a single GPU by setting ``CUDA_VISIBLE_DEVICES``
    before any CUDA context is created, then converts each file with
    ``to_markdown``. A failure on one file is logged (with traceback) and
    does not stop the remaining files.

    Args:
        worker_id: Numeric id, used only in log messages.
        gpu_id: GPU index this worker is restricted to.
        pdf_list: Paths of the PDF files this worker should convert.
        output_dir: Directory forwarded to ``to_markdown``.
    """
    # Must be set before torch initializes CUDA in this process, otherwise
    # the restriction is silently ignored.
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
    for pdf_path in pdf_list:
        try:
            # Lazy %-style args: formatting is skipped if INFO is disabled.
            logging.info("Worker %s, GPU %s -> %s", worker_id, gpu_id, pdf_path)
            to_markdown(
                file_path=pdf_path,
                output_dir=output_dir,
            )
        except Exception:
            # logging.exception keeps the full traceback; the previous
            # logging.error(f"... {e}") threw the stack trace away.
            logging.exception("Worker %s error on %s", worker_id, pdf_path)
def process_batch_in_parallel(pdf_paths, output_dir="./output", num_workers=2, num_gpus=1):
    """
    Convert a batch of PDFs to markdown across multiple worker processes.

    Splits ``pdf_paths`` into up to ``num_workers`` contiguous chunks and
    runs each chunk in its own process via ``worker``, assigning GPUs
    round-robin. Blocks until every worker has finished.

    Args:
        pdf_paths: List of PDF file paths to process. Empty/None is a no-op.
        output_dir: Output directory forwarded to each worker.
        num_workers: Maximum number of worker processes; values < 1 are
            treated as 1 (previously caused a ZeroDivisionError).
        num_gpus: Number of GPUs to spread workers over; values < 1 are
            treated as 1 (previously caused a ZeroDivisionError).

    Returns:
        None.
    """
    if not pdf_paths:
        logging.info("No PDFs to process.")
        return

    # Guard degenerate arguments so the divisions below cannot fail.
    workers = max(1, num_workers)
    gpus = max(1, num_gpus)

    # "spawn" gives each worker a fresh interpreter. This matters for CUDA:
    # fork-started children inherit a CUDA context they cannot safely reuse,
    # and CUDA_VISIBLE_DEVICES set in the child is ignored once the parent
    # has initialized CUDA (see torch.multiprocessing best practices).
    ctx = mp.get_context("spawn")

    # Ceiling division so every path lands in exactly one chunk.
    chunk_size = (len(pdf_paths) + workers - 1) // workers

    processes = []
    for worker_id in range(workers):
        subset = pdf_paths[worker_id * chunk_size:(worker_id + 1) * chunk_size]
        if not subset:
            # Fewer chunks than workers; all later chunks are empty too.
            break
        p = ctx.Process(
            target=worker,
            args=(worker_id, worker_id % gpus, subset, output_dir),
        )
        p.start()
        processes.append(p)

    for p in processes:
        p.join()
    logging.info("All parallel processing complete.")