Update pdfextract_fun.py
Browse files- pdfextract_fun.py +10 -11
pdfextract_fun.py
CHANGED
@@ -50,17 +50,16 @@ def convert_pdf_to_jpg(pdf_path, output_folder, zoom_factor=2):
|
|
50 |
pix.save(output_file)
|
51 |
|
52 |
# Process JPEG images in a folder
|
53 |
-
def process_jpeg_images(
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
save_extracted_instances(images[i], output, i, output_folder)
|
64 |
|
65 |
# Save extracted instances
|
66 |
def save_extracted_instances(img, output, page_num, dest_folder, confidence_threshold=0.8):
|
|
|
50 |
pix.save(output_file)
|
51 |
|
52 |
# Process JPEG images in a folder
|
53 |
+
def process_jpeg_images(output_folder, cfg):
    """Process each page JPEG image in the output folder.

    Page images are the files named ``page_<number>.jpg``. Each one is read
    with ``cv2.imread``, passed to ``analyze_image`` together with ``cfg``,
    and the returned ``output`` is handed to ``save_extracted_instances``
    along with the image, its page number and ``output_folder``.

    Args:
        output_folder: Directory that holds the ``page_<number>.jpg`` files.
            It is also passed to ``save_extracted_instances`` as the
            destination folder.
        cfg: Configuration object forwarded unchanged to ``analyze_image``.
    """
    prefix, suffix = "page_", ".jpg"

    # Derive the page numbers from the file names that actually exist rather
    # than from the raw directory entry count. Any other file in the folder
    # (this same folder is given to save_extracted_instances as its
    # destination) would otherwise inflate the range and trigger spurious
    # "Failed to read" messages, and a numbering that is not a dense 0..N-1
    # sequence would leave the highest-numbered pages unprocessed.
    page_numbers = set()
    for name in os.listdir(output_folder):
        if not (name.startswith(prefix) and name.endswith(suffix)):
            continue
        number = name[len(prefix):-len(suffix)]
        if number.isascii() and number.isdigit():
            page_numbers.add(int(number))

    for page_num in tqdm(sorted(page_numbers), desc="Processing the pdf"):
        file_path = os.path.join(output_folder, f"page_{page_num}.jpg")
        img = cv2.imread(file_path)
        # cv2.imread signals an unreadable file by returning None instead of
        # raising, so check explicitly and move on to the next page.
        if img is None:
            print(f"Failed to read {file_path}. Skipping.")
            continue
        # Only the second element of the analysis result is needed here.
        _, output, _ = analyze_image(img, cfg)
        save_extracted_instances(img, output, page_num, output_folder)
|
|
|
63 |
|
64 |
# Save extracted instances
|
65 |
def save_extracted_instances(img, output, page_num, dest_folder, confidence_threshold=0.8):
|