SkyNait committed
Commit 67ee2ac · 1 Parent(s): a00a164

aws update + image description

__pycache__/inference_svm_model.cpython-310.pyc CHANGED
Binary files a/__pycache__/inference_svm_model.cpython-310.pyc and b/__pycache__/inference_svm_model.cpython-310.pyc differ
 
__pycache__/mineru_single.cpython-310.pyc CHANGED
Binary files a/__pycache__/mineru_single.cpython-310.pyc and b/__pycache__/mineru_single.cpython-310.pyc differ
 
__pycache__/worker.cpython-310.pyc CHANGED
Binary files a/__pycache__/worker.cpython-310.pyc and b/__pycache__/worker.cpython-310.pyc differ
 
download_models_hf.py CHANGED
@@ -60,7 +60,7 @@ if __name__ == '__main__':
     json_mods = {
         'models-dir': model_dir,
         'layoutreader-model-dir': layoutreader_model_dir,
-        'device-mode': 'cuda'
+        'device-mode': 'cuda',
     }
 
     download_and_modify_json(json_url, config_file, json_mods)
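For a quick sanity check after running the script, the patched config can be read back; a minimal sketch, assuming the file lands at the same path that mineru_single.py later opens (the exact model paths will differ per machine):

import json

# Path assumed to match the config that mineru_single.py reads.
with open("/home/user/magic-pdf.json", "r") as f:
    config = json.load(f)

# json_mods above sets these three keys; 'device-mode' should now read 'cuda'.
print(config["device-mode"])
print(config["models-dir"], config["layoutreader-model-dir"])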
mineru.log ADDED
File without changes
mineru_single.py CHANGED
@@ -6,52 +6,52 @@ import requests
 import logging
 import torch
 import gc
-
 from magic_pdf.data.dataset import PymuDocDataset
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
 from magic_pdf.data.io.s3 import S3Writer
 from magic_pdf.data.data_reader_writer.base import DataWriter
-
 from inference_svm_model import SVMModel
+import concurrent.futures
+import boto3
+from io import BytesIO
 
 logging.basicConfig(
     level=logging.INFO,
-    format="%(asctime)s [%(levelname)s] %(name)s - %(message)s"
+    format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
+    handlers=[
+        logging.StreamHandler(),   # This will output to console
+        logging.FileHandler('mineru.log')  # This will save to a file
+    ]
 )
+
 logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)  # Ensure logger level is set to INFO
 
 class Processor:
     def __init__(self):
         try:
-            self.s3_writer = S3Writer(
+            self.s3_writer = s3Writer(
                 ak=os.getenv("S3_ACCESS_KEY"),
                 sk=os.getenv("S3_SECRET_KEY"),
                 bucket=os.getenv("S3_BUCKET_NAME"),
                 endpoint_url=os.getenv("S3_ENDPOINT"),
             )
-
             # self.svm_model = SVMModel()
             # logger.info("Classification model initialized successfully")
-
             with open("/home/user/magic-pdf.json", "r") as f:
                 config = json.load(f)
-
             # self.layout_mode = "doclayout_yolo"
-
             self.layout_mode = config["layout-config"]["model"]
             self.formula_enable = config["formula-config"]["enable"]
-            self.table_enable = config["table-config"]["enable"]
+            self.table_enable = False
             self.language = "en"
-
             endpoint = os.getenv("S3_ENDPOINT", "").rstrip("/")
             bucket = os.getenv("S3_BUCKET_NAME", "")
-            self.prefix = "/document-extracts/"
-
+            self.prefix = "document-extracts/"
             logger.info("Processor initialized successfully")
         except Exception as e:
             logger.error("Failed to initialize Processor: %s", str(e))
             raise
-
     def cleanup_gpu(self):
         """
         Releases GPU memory, use garbage collection to clear PyTorch's CUDA cache.
@@ -63,22 +63,18 @@ class Processor:
             logger.info("GPU memory cleaned up.")
         except Exception as e:
             logger.error("Error during GPU cleanup: %s", e)
-
     def process(self, file_url: str, key: str) -> str:
         """
         Process a single PDF, returning final Markdown with irrelevant images removed.
         """
         logger.info("Processing file: %s", file_url)
-
         try:
             response = requests.get(file_url)
             if response.status_code != 200:
                 logger.error("Failed to download PDF from %s. Status code: %d", file_url, response.status_code)
                 raise Exception(f"Failed to download PDF: {file_url}")
-
             pdf_bytes = response.content
             logger.info("Downloaded %d bytes for file_url='%s'", len(pdf_bytes), file_url)
-
             # Analyze PDF with OCR
             dataset = PymuDocDataset(pdf_bytes)
             inference = doc_analyze(
@@ -90,60 +86,136 @@ class Processor:
                 table_enable=self.table_enable
             )
             logger.info("doc_analyze complete for key='%s'. Started extracting images...", key)
-
             # Classify images and remove irrelevant ones
             # image_writer = ImageWriter(self.s3_writer)
-            image_base_path = f"{self.prefix}{key}/"  # Construct base path
-            image_writer = ImageWriter(self.s3_writer, image_base_path)  # Pass base path to ImageWriter
-
+            image_writer = ImageWriter(self.s3_writer, f"{self.prefix}{key}/")  # Pass base path to ImageWriter
             pipe_result = inference.pipe_ocr_mode(image_writer, lang=self.language)
             logger.info("OCR pipeline completed for key='%s'.", key)
-
-            md_content = pipe_result.get_markdown(self.prefix + key + "/")
-            final_markdown = image_writer.remove_redundant_images(md_content)
+            md_content = pipe_result.get_markdown(f"{self.prefix}{key}/")
+            final_markdown = image_writer.post_process(f"{self.prefix}{key}/", md_content)
             logger.info("Completed PDF process for key='%s'. Final MD length=%d", key, len(final_markdown))
             return final_markdown
         finally:
             # GPU memory is cleaned up after each processing.
             self.cleanup_gpu()
+class s3Writer:
+    def __init__(self, ak: str, sk: str, bucket: str, endpoint_url: str):
+        self.bucket = bucket
+        self.client = boto3.client('s3',
+            aws_access_key_id=ak,
+            aws_secret_access_key=sk,
+            endpoint_url=endpoint_url
+        )
+
+    def write(self, path: str, data: bytes) -> None:
+        """Upload data to S3 using proper keyword arguments"""
+        try:
+            # Convert bytes to file-like object
+            file_obj = BytesIO(data)
+
+            # Upload using upload_fileobj
+            self.client.upload_fileobj(
+                file_obj,
+                self.bucket,
+                path
+            )
+        except Exception as e:
+            logger.error(f"Failed to upload to S3: {str(e)}")
+            raise
 
 class ImageWriter(DataWriter):
     """
     Receives each extracted image. Classifies it, uploads if relevant, or flags
     it for removal if irrelevant.
     """
-    def __init__(self, s3_writer: S3Writer, base_path: str):
+    def __init__(self, s3_writer: s3Writer, base_path: str):
         self.s3_writer = s3_writer
         self.base_path = base_path
         # self.svm_model = svm_model
         self._redundant_images_paths = []
-
+        self.descriptions = {}
+        """
+        {
+            "{path}": {
+                "description": "{description}",
+                "full_path": "{full_path}"
+            }
+        }
+        """
+
     def write(self, path: str, data: bytes) -> None:
         """
         Called for each extracted image. If relevant, upload to S3; otherwise mark for removal.
         """
-        # full_path = os.path.join(self.prefix, path)
-        full_path = f"{self.base_path}{path}"
+        full_path = f"{self.base_path}" + path.split("/")[-1]
         self.s3_writer.write(full_path, data)
-        # label_str = self.svm_model.classify_image(data)
-
-        # if label_str == 1:
-
-        # else:
-        #     self._redundant_images_paths.append(path)
-
-    def remove_redundant_images(self, md_content: str) -> str:
-        for path in self._redundant_images_paths:
-            md_content = md_content.replace(f"![]({path})", "")
+        self.descriptions[path] = {
+            "data": data,
+            "full_path": full_path
+        }
+
+    def post_process(self, key: str, md_content: str) -> str:
+        max_workers = len(self.descriptions)
+        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+            future_to_file = {
+                executor.submit(
+                    call_gemini_for_image_description,
+                    self.descriptions[path]['data']
+                ): path for path in self.descriptions.keys()
+            }
+            for future in concurrent.futures.as_completed(future_to_file):
+                path = future_to_file[future]
+                try:
+                    description = future.result()
+                    if description:
+                        self.descriptions[path]['description'] = description
+                except Exception as e:
+                    logger.error(f"[ERROR] Processing {path}: {str(e)}")
+
+        for path, info in self.descriptions.items():
+            description = info['description']
+            full_path = info['full_path']
+            md_content = md_content.replace(f"![]({key}{path})", f"![{description}]({full_path})")
         return md_content
-
+
+def call_gemini_for_image_description(image_data: bytes) -> str:
+    """Convert image bytes to Gemini-compatible format and get description"""
+    from google import genai
+    import base64
+
+    try:
+        # Initialize Gemini client
+        client = genai.Client(api_key="AIzaSyDtoakpXa2pjJwcQB6TJ5QaXHNSA5JxcrU")
+
+        # Generate content with proper image format
+        response = client.models.generate_content(
+            model="gemini-2.0-flash",
+            contents=[
+                {
+                    "parts": [
+                        {"text": """The provided image is a part of a question paper or markscheme. Extract all the necessary information from the image to be able to identify the question.
+                        For example, if there is an image that contains text like: "Q1 Part A Answer: Life on earth was created by diety..." you should return "Q1 Part A Answer"
+                        If there is no text on this image, return the description of the image. 20 words max."""},
+                        {
+                            "inline_data": {
+                                "mime_type": "image/jpeg",
+                                "data": base64.b64encode(image_data).decode('utf-8')
+                            }
+                        }
+                    ]
+                }
+            ]
+        )
+
+        # Get the response text
+        description = response.text.strip() if response and response.text else "Image description unavailable"
+        return description
+
+    except Exception as e:
+        logger.error(f"Error getting image description: {str(e)}")
+        return ("error", "Error describing image", None)
 if __name__ == "__main__":
     processor = Processor()
-
-    single_url = "https://example.com/somefile.pdf"
-    markdown_result = processor.process(single_url)
-    print("Single file Markdown:\n", markdown_result)
-
-    multiple_urls = ["https://example.com/file1.pdf", "https://example.com/file2.pdf"]
-    batch_results = processor.process_batch(multiple_urls)
-    print("Batch results:", batch_results)
+ single_url = "https://quextro-resources.s3.eu-west-2.amazonaws.com/1739967958667-643657-mark-scheme-computer-principles.pdf?response-content-disposition=inline&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Security-Token=IQoJb3JpZ2luX2VjEJT%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCWV1LXdlc3QtMiJGMEQCIARfSyuot0h2RNrcqVQkc2T%2B1fJZ64NfjmkmAFgCkTG6AiArmbJDAUr7T85HdqAT2RbyLhmiIgpSo3ci4%2FUtSap2wCrUAwi8%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAAaDDUwOTM5OTYxODAzMCIMkfFm%2FgBrHsH1qh59KqgDjfZd1%2BKGzxkn7JorfQ07dL%2BL5fjCA6kmNAzCnCjDpTLnNjBfB1vnO2ZLvtC8RNvnaewY6tFWUfl39dC62ldnfajHeFmxkZqBcbDf3oOGnuO2PIvBgb5%2BvppVDkYjWz7vv5TzpgC2sVzjA38QMwxAnausYWDgspap7qjlfoLJUiBOq9SIMZyKVsfeAf4OiUl0TDc2nheqvNXOJy9TPh94KWbBT35vP3fU9A7ZdF4sElm4nVZMnOPdbR7%2Ba6F57nPLZvUaLZC5Nb011ef6%2BhAxr9yeONh5MAoTGUH2qzedDmN%2FbKannddBy%2FNIaP%2BhF7lWUkKemQrM5vajwU6k2Q45pLruKWRkjtrWxdmkQE4zb67ETj5eGL%2BlPPj%2BPtQWzF7UaoWPUH4tGBZ%2Bqdu479rU1ZSg%2B15lR%2F8SAgP%2BydATGwyRtXEvMRJZIiUems8i6ehxWC%2FscY2%2FtCk9OREKhLwOEEdJDAR4vqt68lnnvVomHrVjwNQvyP9A4V8Ct%2B0SjxP%2F86kJnX3o%2FVEoFT44JWICuMuf8kwoelUbZGPl6SaftGsRSUvoy7PV5TCN3du9BjrlAjKhLpjsCwgp1rJ8cPBFcUgOmL3iXrtHs3FhDLljxbXRZ%2FadHkxAlzf%2BXym%2BFBnhdCkDfmWcMEH3GAOFfv%2FlE5SsZMO1JoXbzQlO3OX6nrUacj7LF7ZoO8TYMVoTyEZSLEABNOU7KCILaFeDGRDJ8Ia5I3jnXvOVouFn2VnhykCuWPTunjkMEQBiHa3mbZP0mVcSviujHXatN11INiR%2BPwAN5oxKXeT25B%2FCCI3wib5Av2tzp8zuw8joib5PWNXOYfRgMR7R0Sj%2FjW5SxWr%2BTD9TAD3%2Fqj5pj3Oo13dNGdv5RwGqk1iHd8okpkFYlxEmXD2tTanpxX8ON1%2FLHz%2BNEUJDOogx8TLw5I6mkVs3zjoMhhwn2%2BWrlnNa%2F3i9lAGyLY6Ps4U23Hv7b4gpH4%2BeJN72Z95hrNtcumq4uuf0pRoJPQ9pjiZttjeDwNZzb7d3XuiEQeOgK8rpTeEgduxhdJOOLwZGrg%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=ASIAXNGUVKHXFLYKHBHD%2F20250220%2Feu-west-2%2Fs3%2Faws4_request&X-Amz-Date=20250220T111935Z&X-Amz-Expires=10800&X-Amz-SignedHeaders=host&X-Amz-Signature=64aa008fdafe72f1a693078156451c0f6f702e89e546954d6b3d61abf9f73ec8"
+ markdown_result = processor.process(single_url, key="1234323")
+ print("Single file Markdown:\n", markdown_result)