File size: 763 Bytes
11551ca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
#!/usr/bin/env python3

import os
import glob
import logging
import torch.multiprocessing as mp
from parallel_multiproc import process_batch_in_parallel

# Configure the root logger to emit INFO and above so the progress message
# logged by main() is actually displayed.
logging.basicConfig(level=logging.INFO)

def main(
    pdf_dir: str = "/home/user/app/test_pdf",
    output_dir: str = "/home/user/app/pdf_output",
) -> None:
    """Run the parallel PDF batch processor over every ``*.pdf`` in *pdf_dir*.

    Args:
        pdf_dir: Directory searched (non-recursively) for files matching
            ``*.pdf``. Defaults to the original hard-coded input location.
        output_dir: Directory passed to ``process_batch_in_parallel`` as the
            output location. It is created if it does not already exist.

    Returns:
        None. When no PDF files are found, a warning is logged and the batch
        processor is not started.
    """
    os.makedirs(output_dir, exist_ok=True)

    # glob yields matches in arbitrary filesystem order; sort them so the
    # batch order is reproducible from run to run.
    pdf_files = sorted(glob.glob(os.path.join(pdf_dir, "*.pdf")))
    logging.info(f"Found {len(pdf_files)} PDF files to process")

    if not pdf_files:
        # Nothing to process: avoid spinning up worker processes for no work.
        logging.warning("No PDF files found in %s; nothing to do", pdf_dir)
        return

    process_batch_in_parallel(
        pdf_paths=pdf_files,
        output_dir=output_dir,
        num_workers=2,  # tuned for our T4 small specifically; do not change it
        num_gpus=1,
    )

if __name__ == "__main__":
    # Select the "spawn" start method before main() runs, i.e. before any
    # worker processes can be created; force=True replaces a start method
    # that may already have been set.
    # NOTE(review): presumably required because the workers use CUDA, which
    # cannot be re-initialized in a forked child; confirm against
    # parallel_multiproc.
    mp.set_start_method("spawn", force=True)

    main()