Arsenii11 committed on
Commit
17f9a87
·
1 Parent(s): 458e97c

Changed image processing

Browse files
mineru_single.py CHANGED
@@ -98,6 +98,8 @@ class Processor:
98
  finally:
99
  # GPU memory is cleaned up after each processing.
100
  self.cleanup_gpu()
 
 
101
  class s3Writer:
102
  def __init__(self, ak: str, sk: str, bucket: str, endpoint_url: str):
103
  self.bucket = bucket
@@ -123,6 +125,7 @@ class s3Writer:
123
  logger.error(f"Failed to upload to S3: {str(e)}")
124
  raise
125
 
 
126
  class ImageWriter(DataWriter):
127
  """
128
  Receives each extracted image. Classifies it, uploads if relevant, or flags
@@ -177,7 +180,8 @@ class ImageWriter(DataWriter):
177
  full_path = info['full_path']
178
  md_content = md_content.replace(f"![]({key}{path})", f"![{description}]({full_path})")
179
  return md_content
180
-
 
181
  def call_gemini_for_image_description(image_data: bytes) -> str:
182
  """Convert image bytes to Gemini-compatible format and get description"""
183
  from google import genai
@@ -206,6 +210,10 @@ def call_gemini_for_image_description(image_data: bytes) -> str:
206
  If there are not enough data, consider information from the surrounding context.
207
  Additionally, if the image contains a truncated part, you must describe it and mark as a
208
  part of some another image that goes before or after current image.
 
 
 
 
209
  """},
210
  {
211
  "inline_data": {
@@ -225,8 +233,17 @@ def call_gemini_for_image_description(image_data: bytes) -> str:
225
  except Exception as e:
226
  logger.error(f"Error getting image description: {str(e)}")
227
  return ("error", "Error describing image", None)
 
 
228
  if __name__ == "__main__":
229
  processor = Processor()
230
  single_url = "https://quextro-resources.s3.eu-west-2.amazonaws.com/1739967958667-643657-mark-scheme-computer-principles.pdf?response-content-disposition=inline&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Security-Token=IQoJb3JpZ2luX2VjEJT%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCWV1LXdlc3QtMiJGMEQCIARfSyuot0h2RNrcqVQkc2T%2B1fJZ64NfjmkmAFgCkTG6AiArmbJDAUr7T85HdqAT2RbyLhmiIgpSo3ci4%2FUtSap2wCrUAwi8%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAAaDDUwOTM5OTYxODAzMCIMkfFm%2FgBrHsH1qh59KqgDjfZd1%2BKGzxkn7JorfQ07dL%2BL5fjCA6kmNAzCnCjDpTLnNjBfB1vnO2ZLvtC8RNvnaewY6tFWUfl39dC62ldnfajHeFmxkZqBcbDf3oOGnuO2PIvBgb5%2BvppVDkYjWz7vv5TzpgC2sVzjA38QMwxAnausYWDgspap7qjlfoLJUiBOq9SIMZyKVsfeAf4OiUl0TDc2nheqvNXOJy9TPh94KWbBT35vP3fU9A7ZdF4sElm4nVZMnOPdbR7%2Ba6F57nPLZvUaLZC5Nb011ef6%2BhAxr9yeONh5MAoTGUH2qzedDmN%2FbKannddBy%2FNIaP%2BhF7lWUkKemQrM5vajwU6k2Q45pLruKWRkjtrWxdmkQE4zb67ETj5eGL%2BlPPj%2BPtQWzF7UaoWPUH4tGBZ%2Bqdu479rU1ZSg%2B15lR%2F8SAgP%2BydATGwyRtXEvMRJZIiUems8i6ehxWC%2FscY2%2FtCk9OREKhLwOEEdJDAR4vqt68lnnvVomHrVjwNQvyP9A4V8Ct%2B0SjxP%2F86kJnX3o%2FVEoFT44JWICuMuf8kwoelUbZGPl6SaftGsRSUvoy7PV5TCN3du9BjrlAjKhLpjsCwgp1rJ8cPBFcUgOmL3iXrtHs3FhDLljxbXRZ%2FadHkxAlzf%2BXym%2BFBnhdCkDfmWcMEH3GAOFfv%2FlE5SsZMO1JoXbzQlO3OX6nrUacj7LF7ZoO8TYMVoTyEZSLEABNOU7KCILaFeDGRDJ8Ia5I3jnXvOVouFn2VnhykCuWPTunjkMEQBiHa3mbZP0mVcSviujHXatN11INiR%2BPwAN5oxKXeT25B%2FCCI3wib5Av2tzp8zuw8joib5PWNXOYfRgMR7R0Sj%2FjW5SxWr%2BTD9TAD3%2Fqj5pj3Oo13dNGdv5RwGqk1iHd8okpkFYlxEmXD2tTanpxX8ON1%2FLHz%2BNEUJDOogx8TLw5I6mkVs3zjoMhhwn2%2BWrlnNa%2F3i9lAGyLY6Ps4U23Hv7b4gpH4%2BeJN72Z95hrNtcumq4uuf0pRoJPQ9pjiZttjeDwNZzb7d3XuiEQeOgK8rpTeEgduxhdJOOLwZGrg%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=ASIAXNGUVKHXFLYKHBHD%2F20250220%2Feu-west-2%2Fs3%2Faws4_request&X-Amz-Date=20250220T111935Z&X-Amz-Expires=10800&X-Amz-SignedHeaders=host&X-Amz-Signature=64aa008fdafe72f1a693078156451c0f6f702e89e546954d6b3d61abf9f73ec8"
231
  markdown_result = processor.process(single_url, key="1234323")
232
- print("Single file Markdown:\n", markdown_result)
 
 
 
 
 
 
 
 
98
  finally:
99
  # GPU memory is cleaned up after each processing.
100
  self.cleanup_gpu()
101
+
102
+
103
  class s3Writer:
104
  def __init__(self, ak: str, sk: str, bucket: str, endpoint_url: str):
105
  self.bucket = bucket
 
125
  logger.error(f"Failed to upload to S3: {str(e)}")
126
  raise
127
 
128
+
129
  class ImageWriter(DataWriter):
130
  """
131
  Receives each extracted image. Classifies it, uploads if relevant, or flags
 
180
  full_path = info['full_path']
181
  md_content = md_content.replace(f"![]({key}{path})", f"![{description}]({full_path})")
182
  return md_content
183
+
184
+
185
  def call_gemini_for_image_description(image_data: bytes) -> str:
186
  """Convert image bytes to Gemini-compatible format and get description"""
187
  from google import genai
 
210
  If there are not enough data, consider information from the surrounding context.
211
  Additionally, if the image contains a truncated part, you must describe it and mark as a
212
  part of some another image that goes before or after current image.
213
+
214
+ If an image is a multiple-choice question's options, make sure to modify your answer to add
215
+ "MCQ: A option B option C option D option", where MCQ is a descriptor and "option" would be
216
+ replaced with actual option from image.
217
  """},
218
  {
219
  "inline_data": {
 
233
  except Exception as e:
234
  logger.error(f"Error getting image description: {str(e)}")
235
  return ("error", "Error describing image", None)
236
+
237
+
238
  if __name__ == "__main__":
239
  processor = Processor()
240
  single_url = "https://quextro-resources.s3.eu-west-2.amazonaws.com/1739967958667-643657-mark-scheme-computer-principles.pdf?response-content-disposition=inline&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Security-Token=IQoJb3JpZ2luX2VjEJT%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCWV1LXdlc3QtMiJGMEQCIARfSyuot0h2RNrcqVQkc2T%2B1fJZ64NfjmkmAFgCkTG6AiArmbJDAUr7T85HdqAT2RbyLhmiIgpSo3ci4%2FUtSap2wCrUAwi8%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAAaDDUwOTM5OTYxODAzMCIMkfFm%2FgBrHsH1qh59KqgDjfZd1%2BKGzxkn7JorfQ07dL%2BL5fjCA6kmNAzCnCjDpTLnNjBfB1vnO2ZLvtC8RNvnaewY6tFWUfl39dC62ldnfajHeFmxkZqBcbDf3oOGnuO2PIvBgb5%2BvppVDkYjWz7vv5TzpgC2sVzjA38QMwxAnausYWDgspap7qjlfoLJUiBOq9SIMZyKVsfeAf4OiUl0TDc2nheqvNXOJy9TPh94KWbBT35vP3fU9A7ZdF4sElm4nVZMnOPdbR7%2Ba6F57nPLZvUaLZC5Nb011ef6%2BhAxr9yeONh5MAoTGUH2qzedDmN%2FbKannddBy%2FNIaP%2BhF7lWUkKemQrM5vajwU6k2Q45pLruKWRkjtrWxdmkQE4zb67ETj5eGL%2BlPPj%2BPtQWzF7UaoWPUH4tGBZ%2Bqdu479rU1ZSg%2B15lR%2F8SAgP%2BydATGwyRtXEvMRJZIiUems8i6ehxWC%2FscY2%2FtCk9OREKhLwOEEdJDAR4vqt68lnnvVomHrVjwNQvyP9A4V8Ct%2B0SjxP%2F86kJnX3o%2FVEoFT44JWICuMuf8kwoelUbZGPl6SaftGsRSUvoy7PV5TCN3du9BjrlAjKhLpjsCwgp1rJ8cPBFcUgOmL3iXrtHs3FhDLljxbXRZ%2FadHkxAlzf%2BXym%2BFBnhdCkDfmWcMEH3GAOFfv%2FlE5SsZMO1JoXbzQlO3OX6nrUacj7LF7ZoO8TYMVoTyEZSLEABNOU7KCILaFeDGRDJ8Ia5I3jnXvOVouFn2VnhykCuWPTunjkMEQBiHa3mbZP0mVcSviujHXatN11INiR%2BPwAN5oxKXeT25B%2FCCI3wib5Av2tzp8zuw8joib5PWNXOYfRgMR7R0Sj%2FjW5SxWr%2BTD9TAD3%2Fqj5pj3Oo13dNGdv5RwGqk1iHd8okpkFYlxEmXD2tTanpxX8ON1%2FLHz%2BNEUJDOogx8TLw5I6mkVs3zjoMhhwn2%2BWrlnNa%2F3i9lAGyLY6Ps4U23Hv7b4gpH4%2BeJN72Z95hrNtcumq4uuf0pRoJPQ9pjiZttjeDwNZzb7d3XuiEQeOgK8rpTeEgduxhdJOOLwZGrg%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=ASIAXNGUVKHXFLYKHBHD%2F20250220%2Feu-west-2%2Fs3%2Faws4_request&X-Amz-Date=20250220T111935Z&X-Amz-Expires=10800&X-Amz-SignedHeaders=host&X-Amz-Signature=64aa008fdafe72f1a693078156451c0f6f702e89e546954d6b3d61abf9f73ec8"
241
  markdown_result = processor.process(single_url, key="1234323")
242
+ print("Single file Markdown:\n", markdown_result)
243
+
244
+ # if __name__ == "__main__":
245
+ # with open("./test_image.jpg", "rb") as file:
246
+ # test_image = file.read()
247
+
248
+ # print(call_gemini_for_image_description(test_image))
249
+
model_classification/svm_model.joblib CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:76738c8932b22f2fce7e66fd038a02bf6690f752c36e61ffe024b2bd8158d390
3
- size 271727707
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70dd9281be75d0e9d3889bb48ad65088344b83d9d7c33c682a012e5468440e1f
3
+ size 263076507