zliang committed on
Commit
0ce080b
·
verified ·
1 Parent(s): d5da17c

Update pdfextract_fun.py

Browse files
Files changed (1) hide show
  1. pdfextract_fun.py +10 -11
pdfextract_fun.py CHANGED
@@ -50,17 +50,16 @@ def convert_pdf_to_jpg(pdf_path, output_folder, zoom_factor=2):
50
  pix.save(output_file)
51
 
52
  # Process JPEG images in a folder
53
- def process_jpeg_images((output_folder, cfg, batch_size=10):
54
- image_paths = [os.path.join(output_folder, f) for f in os.listdir(output_folder) if f.endswith('.jpg')]
55
- batches = [image_paths[i:i + batch_size] for i in range(0, len(image_paths), batch_size)]
56
-
57
- for batch in tqdm(batches, desc="Processing images in batches"):
58
- images = [cv2.imread(image_path) for image_path in batch]
59
- batch_results = batch_analyze_images(images, cfg) # This function needs to be implemented to support batch processing
60
-
61
- for i, (result_image, output, v) in enumerate(batch_results):
62
- # Assuming batch_analyze_images returns a list of tuples, each containing the results for one image
63
- save_extracted_instances(images[i], output, i, output_folder)
64
 
65
  # Save extracted instances
66
  def save_extracted_instances(img, output, page_num, dest_folder, confidence_threshold=0.8):
 
50
  pix.save(output_file)
51
 
52
  # Process JPEG images in a folder
53
def process_jpeg_images(output_folder, cfg):
    """Analyze every ``page_<n>.jpg`` image found in *output_folder*.

    Each page image is read with ``cv2.imread``, passed to
    ``analyze_image`` together with *cfg*, and the analysis output is handed
    to ``save_extracted_instances`` with *output_folder* as the destination.

    Args:
        output_folder: Directory holding the ``page_<n>.jpg`` files; it is
            also the folder the extracted instances are written to.
        cfg: Configuration object forwarded unchanged to ``analyze_image``.

    Raises:
        OSError: Propagated from ``os.listdir`` when *output_folder* cannot
            be listed.
    """
    # Build the work list from the page files that actually exist. Using the
    # raw directory-entry count as the loop bound breaks as soon as the
    # folder holds anything besides page images (earlier crops, subfolders),
    # and it misses pages whose index is not below that count.
    prefix, suffix = "page_", ".jpg"
    page_numbers = []
    for name in os.listdir(output_folder):
        if not (name.startswith(prefix) and name.endswith(suffix)):
            continue
        index_text = name[len(prefix):-len(suffix)]
        # Accept only canonical names, i.e. exactly what f"page_{n}.jpg"
        # produces, so the path rebuilt below always points at this file.
        if index_text.isdecimal() and name == f"{prefix}{int(index_text)}{suffix}":
            page_numbers.append(int(index_text))
    page_numbers.sort()  # process pages in document order

    for page_num in tqdm(page_numbers, desc="Processing the pdf"):
        file_path = os.path.join(output_folder, f"page_{page_num}.jpg")
        img = cv2.imread(file_path)
        if img is None:
            # cv2.imread signals an unreadable image by returning None.
            print(f"Failed to read {file_path}. Skipping.")
            continue
        # analyze_image yields three values; only the middle one is used
        # here (presumably the model predictions -- confirm in analyze_image).
        _, output, _ = analyze_image(img, cfg)
        save_extracted_instances(img, output, page_num, output_folder)
 
63
 
64
  # Save extracted instances
65
  def save_extracted_instances(img, output, page_num, dest_folder, confidence_threshold=0.8):