import os
import logging
import multiprocessing as mp

import torch

from mineru_single import to_markdown

logging.basicConfig(level=logging.INFO)

def worker(worker_id, gpu_id, pdf_list, output_dir):
    """
    Worker function:
    1) Assigns a CUDA device to this process (if available).
    2) Calls `to_markdown` for each file.
    """
    # Pin this process to a single GPU. This must happen before the first
    # CUDA call in the process, or the setting is silently ignored.
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
    if not torch.cuda.is_available():
        logging.warning(f"Worker {worker_id}: CUDA unavailable, running on CPU.")

    for pdf_path in pdf_list:
        try:
            logging.info(f"Worker {worker_id}, GPU {gpu_id} -> {pdf_path}")
            to_markdown(file_path=pdf_path, output_dir=output_dir)
        except Exception as e:
            # Log and keep going so one bad PDF does not abort the batch.
            logging.error(f"Worker {worker_id} error on {pdf_path}: {e}")
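
# Note: once CUDA_VISIBLE_DEVICES is narrowed to a single id, that GPU is
# exposed to the worker process as device 0, so any CUDA code running in the
# worker sees it as "cuda:0" regardless of which physical GPU was assigned.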

def process_batch_in_parallel(pdf_paths, output_dir="./output", num_workers=2, num_gpus=1):
    """
    Splits a list of PDF file paths across `num_workers` processes, each
    handling a contiguous chunk, with GPUs assigned round-robin.
    """
    if not pdf_paths:
        logging.info("No PDFs to process.")
        return

    # Ceiling division: every path lands in a chunk even when the list does
    # not divide evenly among the workers.
    chunk_size = (len(pdf_paths) + num_workers - 1) // num_workers
    processes = []

    # Use the "spawn" start method; with the default fork on Linux, children
    # inherit the parent's CUDA state and per-process CUDA_VISIBLE_DEVICES
    # settings may be ignored.
    ctx = mp.get_context("spawn")

    for worker_id in range(num_workers):
        start_idx = worker_id * chunk_size
        end_idx = start_idx + chunk_size
        subset = pdf_paths[start_idx:end_idx]
        if not subset:
            break

        gpu_id = worker_id % num_gpus  # round-robin over the available GPUs
        p = ctx.Process(target=worker, args=(worker_id, gpu_id, subset, output_dir))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()

    logging.info("All parallel processing complete.")
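
# Example driver: a minimal sketch assuming PDFs live under ./pdfs (the path,
# worker count, and GPU count below are illustrative). The __main__ guard is
# required because the "spawn" start method re-imports this module in each
# child process.
if __name__ == "__main__":
    import glob

    pdf_files = sorted(glob.glob("./pdfs/*.pdf"))
    process_batch_in_parallel(pdf_files, output_dir="./output", num_workers=2, num_gpus=1)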