#!/usr/bin/env python3
import os | |
import glob | |
import logging | |
import torch.multiprocessing as mp | |
from parallel_multiproc import process_batch_in_parallel | |
# Configure the root logger once at import so the INFO-level progress
# messages emitted by main() are actually shown.
logging.basicConfig(level=logging.INFO)
def main(
    pdf_dir: str = "/home/user/app/test_pdf",
    output_dir: str = "/home/user/app/pdf_output",
) -> None:
    """Collect the PDFs in *pdf_dir* and hand them to the parallel batch processor.

    Args:
        pdf_dir: Directory searched (non-recursively) for files matching
            ``*.pdf``.
        output_dir: Directory passed to ``process_batch_in_parallel`` as its
            ``output_dir``; it is created first if it does not exist.

    Returns:
        None. When no PDF matches, a warning is logged and the batch
        processor is not invoked.
    """
    os.makedirs(output_dir, exist_ok=True)

    # glob returns matches in arbitrary order; sort so runs are reproducible.
    pdf_files = sorted(glob.glob(os.path.join(pdf_dir, "*.pdf")))
    logging.info("Found %d PDF files to process", len(pdf_files))

    if not pdf_files:
        # Nothing matched: skip the parallel dispatch rather than calling it
        # with an empty list.
        logging.warning("No PDF files found in %s; nothing to do", pdf_dir)
        return

    process_batch_in_parallel(
        pdf_paths=pdf_files,
        output_dir=output_dir,
        num_workers=2,  # for our T4 small specifically, do not change it
        num_gpus=1,
    )
if __name__ == "__main__":
    # Select the "spawn" start method before any worker process is created.
    # PyTorch documents that CUDA cannot be used in forked subprocesses;
    # presumably the workers started by process_batch_in_parallel use the GPU
    # (num_gpus=1) -- confirm against parallel_multiproc.
    # force=True overrides a start method that may already have been set.
    mp.set_start_method("spawn", force=True)
    main()