#!/usr/bin/env python3
"""Run the parallel PDF batch processor over every PDF in a directory.

The script collects all ``*.pdf`` files from the input directory and passes
them to ``parallel_multiproc.process_batch_in_parallel`` together with the
output directory and the worker/GPU counts.
"""
import glob
import logging
import os

import torch.multiprocessing as mp

from parallel_multiproc import process_batch_in_parallel

logging.basicConfig(level=logging.INFO)

# Default locations used when main() is called without arguments.
DEFAULT_PDF_DIR = "/home/user/app/test_pdf"
DEFAULT_OUTPUT_DIR = "/home/user/app/pdf_output"


def main(pdf_dir: str = DEFAULT_PDF_DIR, output_dir: str = DEFAULT_OUTPUT_DIR) -> None:
    """Process every PDF found directly inside *pdf_dir*.

    Args:
        pdf_dir: Directory searched (non-recursively) for ``*.pdf`` files.
        output_dir: Directory handed to the batch processor for its output;
            created if it does not exist yet.
    """
    os.makedirs(output_dir, exist_ok=True)

    # glob.glob returns entries in arbitrary order; sort so that the batch
    # order (and therefore the logs) is reproducible between runs.
    pdf_files = sorted(glob.glob(os.path.join(pdf_dir, "*.pdf")))
    logging.info(f"Found {len(pdf_files)} PDF files to process")

    if not pdf_files:
        # Nothing to do: avoid starting worker processes for an empty batch.
        logging.warning("No PDF files found in %s; nothing to process", pdf_dir)
        return

    process_batch_in_parallel(
        pdf_paths=pdf_files,
        output_dir=output_dir,
        num_workers=2,  # for our T4 small specifically, do not change it
        num_gpus=1,
    )


if __name__ == "__main__":
    # The start method must be chosen before any worker process is created.
    # NOTE(review): "spawn" is presumably required because the workers use
    # CUDA, which cannot be used in forked children - confirm against
    # parallel_multiproc.
    mp.set_start_method("spawn", force=True)
    main()