mratanusarkar committed on
Commit
3d948a1
·
1 Parent(s): 5406446

temp: attempt - force to png with pillow

Browse files
medrag_multi_modal/document_loader/image_loader/pymupdf_img_loader.py CHANGED
@@ -2,8 +2,8 @@ import os
2
  from typing import Any, Dict
3
 
4
  import fitz
5
- from PIL import Image
6
- from io import BytesIO
7
 
8
  from .base_img_loader import BaseImageLoader
9
 
@@ -76,7 +76,7 @@ class PyMuPDFImageLoader(BaseImageLoader):
76
  image_file_paths = []
77
 
78
  pdf_document = fitz.open(self.document_file_path)
79
- page = pdf_document[page_idx]
80
 
81
  images = page.get_images(full=True)
82
  for img_idx, image in enumerate(images):
@@ -85,33 +85,33 @@ class PyMuPDFImageLoader(BaseImageLoader):
85
  image_bytes = base_image["image"]
86
  image_ext = base_image["ext"]
87
 
88
- if image_ext == "jb2":
89
- image_ext = "png"
90
- elif image_ext == "jpx":
91
- image_ext = "jpg"
92
-
93
- image_file_name = f"page{page_idx}_fig{img_idx}.{image_ext}"
94
- image_file_path = os.path.join(image_save_dir, image_file_name)
95
-
96
- # For JBIG2 and JPEG2000, we need to convert the image
97
- if base_image["ext"] in ["jb2", "jpx"]:
98
- try:
99
- pix = fitz.Pixmap(image_bytes)
100
- pix.save(image_file_path)
101
- except Exception as e:
102
- print(f"Error processing image: {e}")
103
- # Fallback to using PIL for image conversion
104
- try:
105
- img = Image.open(BytesIO(image_bytes))
106
- img.save(image_file_path)
107
- except Exception as e:
108
- print(f"Failed to process image with PIL: {e}")
109
- continue # Skip this image if both methods fail
110
- else:
111
- with open(image_file_path, "wb") as image_file:
112
- image_file.write(image_bytes)
113
-
114
- image_file_paths.append(image_file_path)
115
 
116
  pdf_document.close()
117
 
 
2
  from typing import Any, Dict
3
 
4
  import fitz
5
+ from PIL import Image, ImageOps, UnidentifiedImageError
6
+ import io
7
 
8
  from .base_img_loader import BaseImageLoader
9
 
 
76
  image_file_paths = []
77
 
78
  pdf_document = fitz.open(self.document_file_path)
79
+ page = pdf_document.load_page(page_idx)
80
 
81
  images = page.get_images(full=True)
82
  for img_idx, image in enumerate(images):
 
85
  image_bytes = base_image["image"]
86
  image_ext = base_image["ext"]
87
 
88
+ try:
89
+ img = Image.open(io.BytesIO(image_bytes))
90
+
91
+ if img.mode in ['1', 'P']:
92
+ img = ImageOps.invert(img.convert('L'))
93
+
94
+ if img.mode == 'CMYK':
95
+ img = img.convert('RGB')
96
+
97
+ if image_ext not in ['png', 'jpg', 'jpeg']:
98
+ image_ext = 'png'
99
+ image_file_name = f"page{page_idx}_fig{img_idx}.png"
100
+ image_file_path = os.path.join(image_save_dir, image_file_name)
101
+
102
+ img.save(image_file_path, format="PNG")
103
+ else:
104
+ image_file_name = f"page{page_idx}_fig{img_idx}.{image_ext}"
105
+ image_file_path = os.path.join(image_save_dir, image_file_name)
106
+
107
+ with open(image_file_path, "wb") as image_file:
108
+ image_file.write(image_bytes)
109
+
110
+ image_file_paths.append(image_file_path)
111
+
112
+ except (UnidentifiedImageError, OSError) as e:
113
+ print(f"Skipping image at page {page_idx}, fig {img_idx} due to an error: {e}")
114
+ continue
115
 
116
  pdf_document.close()
117