SkyNait committed on
Commit
a00a164
·
verified ·
1 Parent(s): dbd3d89

fix S3 bucket image upload

Browse files
Files changed (1) hide show
  1. mineru_single.py +10 -4
mineru_single.py CHANGED
@@ -45,7 +45,7 @@ class Processor:
45
 
46
  endpoint = os.getenv("S3_ENDPOINT", "").rstrip("/")
47
  bucket = os.getenv("S3_BUCKET_NAME", "")
48
- self.prefix = f"/document-extracts/"
49
 
50
  logger.info("Processor initialized successfully")
51
  except Exception as e:
@@ -92,7 +92,10 @@ class Processor:
92
  logger.info("doc_analyze complete for key='%s'. Started extracting images...", key)
93
 
94
  # Classify images and remove irrelevant ones
95
- image_writer = ImageWriter(self.s3_writer)
 
 
 
96
  pipe_result = inference.pipe_ocr_mode(image_writer, lang=self.language)
97
  logger.info("OCR pipeline completed for key='%s'.", key)
98
 
@@ -109,8 +112,9 @@ class ImageWriter(DataWriter):
109
  Receives each extracted image. Classifies it, uploads if relevant, or flags
110
  it for removal if irrelevant.
111
  """
112
- def __init__(self, s3_writer: S3Writer):
113
  self.s3_writer = s3_writer
 
114
  # self.svm_model = svm_model
115
  self._redundant_images_paths = []
116
 
@@ -118,7 +122,9 @@ class ImageWriter(DataWriter):
118
  """
119
  Called for each extracted image. If relevant, upload to S3; otherwise mark for removal.
120
  """
121
- self.s3_writer.write(path, data)
 
 
122
  # label_str = self.svm_model.classify_image(data)
123
 
124
  # if label_str == 1:
 
45
 
46
  endpoint = os.getenv("S3_ENDPOINT", "").rstrip("/")
47
  bucket = os.getenv("S3_BUCKET_NAME", "")
48
+ self.prefix = "/document-extracts/"
49
 
50
  logger.info("Processor initialized successfully")
51
  except Exception as e:
 
92
  logger.info("doc_analyze complete for key='%s'. Started extracting images...", key)
93
 
94
  # Classify images and remove irrelevant ones
95
+ # image_writer = ImageWriter(self.s3_writer)
96
+ image_base_path = f"{self.prefix}{key}/" # Construct base path
97
+ image_writer = ImageWriter(self.s3_writer, image_base_path) # Pass base path to ImageWriter
98
+
99
  pipe_result = inference.pipe_ocr_mode(image_writer, lang=self.language)
100
  logger.info("OCR pipeline completed for key='%s'.", key)
101
 
 
112
  Receives each extracted image. Classifies it, uploads if relevant, or flags
113
  it for removal if irrelevant.
114
  """
115
+ def __init__(self, s3_writer: S3Writer, base_path: str):
116
  self.s3_writer = s3_writer
117
+ self.base_path = base_path
118
  # self.svm_model = svm_model
119
  self._redundant_images_paths = []
120
 
 
122
  """
123
  Called for each extracted image. If relevant, upload to S3; otherwise mark for removal.
124
  """
125
+ # full_path = os.path.join(self.prefix, path)
126
+ full_path = f"{self.base_path}{path}"
127
+ self.s3_writer.write(full_path, data)
128
  # label_str = self.svm_model.classify_image(data)
129
 
130
  # if label_str == 1: