SkyNait committed on
Commit
dbd3d89
·
1 Parent(s): b529b1d
Files changed (1) hide show
  1. mineru_single.py +14 -13
mineru_single.py CHANGED
@@ -30,22 +30,22 @@ class Processor:
30
  endpoint_url=os.getenv("S3_ENDPOINT"),
31
  )
32
 
33
- self.svm_model = SVMModel()
34
- logger.info("Classification model initialized successfully")
35
 
36
  with open("/home/user/magic-pdf.json", "r") as f:
37
  config = json.load(f)
38
 
39
- self.layout_mode = "doclayout_yolo"
40
 
41
- # self.layout_mode = config["layout-config"]["model"]
42
  self.formula_enable = config["formula-config"]["enable"]
43
  self.table_enable = config["table-config"]["enable"]
44
  self.language = "en"
45
 
46
  endpoint = os.getenv("S3_ENDPOINT", "").rstrip("/")
47
  bucket = os.getenv("S3_BUCKET_NAME", "")
48
- self.prefix = f"{endpoint}/{bucket}/document-extracts/"
49
 
50
  logger.info("Processor initialized successfully")
51
  except Exception as e:
@@ -92,7 +92,7 @@ class Processor:
92
  logger.info("doc_analyze complete for key='%s'. Started extracting images...", key)
93
 
94
  # Classify images and remove irrelevant ones
95
- image_writer = ImageWriter(self.s3_writer, self.svm_model)
96
  pipe_result = inference.pipe_ocr_mode(image_writer, lang=self.language)
97
  logger.info("OCR pipeline completed for key='%s'.", key)
98
 
@@ -109,21 +109,22 @@ class ImageWriter(DataWriter):
109
  Receives each extracted image. Classifies it, uploads if relevant, or flags
110
  it for removal if irrelevant.
111
  """
112
- def __init__(self, s3_writer: S3Writer, svm_model: SVMModel):
113
  self.s3_writer = s3_writer
114
- self.svm_model = svm_model
115
  self._redundant_images_paths = []
116
 
117
  def write(self, path: str, data: bytes) -> None:
118
  """
119
  Called for each extracted image. If relevant, upload to S3; otherwise mark for removal.
120
  """
121
- label_str = self.svm_model.classify_image(data)
 
122
 
123
- if label_str == 1:
124
- self.s3_writer.write(path, data)
125
- else:
126
- self._redundant_images_paths.append(path)
127
 
128
  def remove_redundant_images(self, md_content: str) -> str:
129
  for path in self._redundant_images_paths:
 
30
  endpoint_url=os.getenv("S3_ENDPOINT"),
31
  )
32
 
33
+ # self.svm_model = SVMModel()
34
+ # logger.info("Classification model initialized successfully")
35
 
36
  with open("/home/user/magic-pdf.json", "r") as f:
37
  config = json.load(f)
38
 
39
+ # self.layout_mode = "doclayout_yolo"
40
 
41
+ self.layout_mode = config["layout-config"]["model"]
42
  self.formula_enable = config["formula-config"]["enable"]
43
  self.table_enable = config["table-config"]["enable"]
44
  self.language = "en"
45
 
46
  endpoint = os.getenv("S3_ENDPOINT", "").rstrip("/")
47
  bucket = os.getenv("S3_BUCKET_NAME", "")
48
+ self.prefix = f"/document-extracts/"
49
 
50
  logger.info("Processor initialized successfully")
51
  except Exception as e:
 
92
  logger.info("doc_analyze complete for key='%s'. Started extracting images...", key)
93
 
94
  # Classify images and remove irrelevant ones
95
+ image_writer = ImageWriter(self.s3_writer)
96
  pipe_result = inference.pipe_ocr_mode(image_writer, lang=self.language)
97
  logger.info("OCR pipeline completed for key='%s'.", key)
98
 
 
109
  Receives each extracted image. Classifies it, uploads if relevant, or flags
110
  it for removal if irrelevant.
111
  """
112
+ def __init__(self, s3_writer: S3Writer):
113
  self.s3_writer = s3_writer
114
+ # self.svm_model = svm_model
115
  self._redundant_images_paths = []
116
 
117
  def write(self, path: str, data: bytes) -> None:
118
  """
119
  Called for each extracted image. If relevant, upload to S3; otherwise mark for removal.
120
  """
121
+ self.s3_writer.write(path, data)
122
+ # label_str = self.svm_model.classify_image(data)
123
 
124
+ # if label_str == 1:
125
+
126
+ # else:
127
+ # self._redundant_images_paths.append(path)
128
 
129
  def remove_redundant_images(self, md_content: str) -> str:
130
  for path in self._redundant_images_paths: