Changed image processing
Browse files
- mineru_single.py +19 -2
- model_classification/svm_model.joblib +2 -2
mineru_single.py
CHANGED
@@ -98,6 +98,8 @@ class Processor:
|
|
98 |
finally:
|
99 |
# GPU memory is cleaned up after each processing.
|
100 |
self.cleanup_gpu()
|
|
|
|
|
101 |
class s3Writer:
|
102 |
def __init__(self, ak: str, sk: str, bucket: str, endpoint_url: str):
|
103 |
self.bucket = bucket
|
@@ -123,6 +125,7 @@ class s3Writer:
|
|
123 |
logger.error(f"Failed to upload to S3: {str(e)}")
|
124 |
raise
|
125 |
|
|
|
126 |
class ImageWriter(DataWriter):
|
127 |
"""
|
128 |
Receives each extracted image. Classifies it, uploads if relevant, or flags
|
@@ -177,7 +180,8 @@ class ImageWriter(DataWriter):
|
|
177 |
full_path = info['full_path']
|
178 |
md_content = md_content.replace(f"", f"")
|
179 |
return md_content
|
180 |
-
|
|
|
181 |
def call_gemini_for_image_description(image_data: bytes) -> str:
|
182 |
"""Convert image bytes to Gemini-compatible format and get description"""
|
183 |
from google import genai
|
@@ -206,6 +210,10 @@ def call_gemini_for_image_description(image_data: bytes) -> str:
|
|
206 |
If there are not enough data, consider information from the surrounding context.
|
207 |
Additionally, if the image contains a truncated part, you must describe it and mark as a
|
208 |
part of some another image that goes before or after current image.
|
|
|
|
|
|
|
|
|
209 |
"""},
|
210 |
{
|
211 |
"inline_data": {
|
@@ -225,8 +233,17 @@ def call_gemini_for_image_description(image_data: bytes) -> str:
|
|
225 |
except Exception as e:
|
226 |
logger.error(f"Error getting image description: {str(e)}")
|
227 |
return ("error", "Error describing image", None)
|
|
|
|
|
228 |
if __name__ == "__main__":
|
229 |
processor = Processor()
|
230 |
single_url = "https://quextro-resources.s3.eu-west-2.amazonaws.com/1739967958667-643657-mark-scheme-computer-principles.pdf?response-content-disposition=inline&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Security-Token=IQoJb3JpZ2luX2VjEJT%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCWV1LXdlc3QtMiJGMEQCIARfSyuot0h2RNrcqVQkc2T%2B1fJZ64NfjmkmAFgCkTG6AiArmbJDAUr7T85HdqAT2RbyLhmiIgpSo3ci4%2FUtSap2wCrUAwi8%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAAaDDUwOTM5OTYxODAzMCIMkfFm%2FgBrHsH1qh59KqgDjfZd1%2BKGzxkn7JorfQ07dL%2BL5fjCA6kmNAzCnCjDpTLnNjBfB1vnO2ZLvtC8RNvnaewY6tFWUfl39dC62ldnfajHeFmxkZqBcbDf3oOGnuO2PIvBgb5%2BvppVDkYjWz7vv5TzpgC2sVzjA38QMwxAnausYWDgspap7qjlfoLJUiBOq9SIMZyKVsfeAf4OiUl0TDc2nheqvNXOJy9TPh94KWbBT35vP3fU9A7ZdF4sElm4nVZMnOPdbR7%2Ba6F57nPLZvUaLZC5Nb011ef6%2BhAxr9yeONh5MAoTGUH2qzedDmN%2FbKannddBy%2FNIaP%2BhF7lWUkKemQrM5vajwU6k2Q45pLruKWRkjtrWxdmkQE4zb67ETj5eGL%2BlPPj%2BPtQWzF7UaoWPUH4tGBZ%2Bqdu479rU1ZSg%2B15lR%2F8SAgP%2BydATGwyRtXEvMRJZIiUems8i6ehxWC%2FscY2%2FtCk9OREKhLwOEEdJDAR4vqt68lnnvVomHrVjwNQvyP9A4V8Ct%2B0SjxP%2F86kJnX3o%2FVEoFT44JWICuMuf8kwoelUbZGPl6SaftGsRSUvoy7PV5TCN3du9BjrlAjKhLpjsCwgp1rJ8cPBFcUgOmL3iXrtHs3FhDLljxbXRZ%2FadHkxAlzf%2BXym%2BFBnhdCkDfmWcMEH3GAOFfv%2FlE5SsZMO1JoXbzQlO3OX6nrUacj7LF7ZoO8TYMVoTyEZSLEABNOU7KCILaFeDGRDJ8Ia5I3jnXvOVouFn2VnhykCuWPTunjkMEQBiHa3mbZP0mVcSviujHXatN11INiR%2BPwAN5oxKXeT25B%2FCCI3wib5Av2tzp8zuw8joib5PWNXOYfRgMR7R0Sj%2FjW5SxWr%2BTD9TAD3%2Fqj5pj3Oo13dNGdv5RwGqk1iHd8okpkFYlxEmXD2tTanpxX8ON1%2FLHz%2BNEUJDOogx8TLw5I6mkVs3zjoMhhwn2%2BWrlnNa%2F3i9lAGyLY6Ps4U23Hv7b4gpH4%2BeJN72Z95hrNtcumq4uuf0pRoJPQ9pjiZttjeDwNZzb7d3XuiEQeOgK8rpTeEgduxhdJOOLwZGrg%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=ASIAXNGUVKHXFLYKHBHD%2F20250220%2Feu-west-2%2Fs3%2Faws4_request&X-Amz-Date=20250220T111935Z&X-Amz-Expires=10800&X-Amz-SignedHeaders=host&X-Amz-Signature=64aa008fdafe72f1a693078156451c0f6f702e89e546954d6b3d61abf9f73ec8"
|
231 |
markdown_result = processor.process(single_url, key="1234323")
|
232 |
-
print("Single file Markdown:\n", markdown_result)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
98 |
finally:
|
99 |
# GPU memory is cleaned up after each processing.
|
100 |
self.cleanup_gpu()
|
101 |
+
|
102 |
+
|
103 |
class s3Writer:
|
104 |
def __init__(self, ak: str, sk: str, bucket: str, endpoint_url: str):
|
105 |
self.bucket = bucket
|
|
|
125 |
logger.error(f"Failed to upload to S3: {str(e)}")
|
126 |
raise
|
127 |
|
128 |
+
|
129 |
class ImageWriter(DataWriter):
|
130 |
"""
|
131 |
Receives each extracted image. Classifies it, uploads if relevant, or flags
|
|
|
180 |
full_path = info['full_path']
|
181 |
md_content = md_content.replace(f"", f"")
|
182 |
return md_content
|
183 |
+
|
184 |
+
|
185 |
def call_gemini_for_image_description(image_data: bytes) -> str:
|
186 |
"""Convert image bytes to Gemini-compatible format and get description"""
|
187 |
from google import genai
|
|
|
210 |
If there are not enough data, consider information from the surrounding context.
|
211 |
Additionally, if the image contains a truncated part, you must describe it and mark as a
|
212 |
part of some another image that goes before or after current image.
|
213 |
+
|
214 |
+
If an image is a multiple-choice question's options, make sure to modify your answer to add
|
215 |
+
"MCQ: A option B option C option D option", where MCQ is a descriptor and "option" would be
|
216 |
+
replaced with actual option from image.
|
217 |
"""},
|
218 |
{
|
219 |
"inline_data": {
|
|
|
233 |
except Exception as e:
|
234 |
logger.error(f"Error getting image description: {str(e)}")
|
235 |
return ("error", "Error describing image", None)
|
236 |
+
|
237 |
+
|
238 |
if __name__ == "__main__":
|
239 |
processor = Processor()
|
240 |
single_url = "https://quextro-resources.s3.eu-west-2.amazonaws.com/1739967958667-643657-mark-scheme-computer-principles.pdf?response-content-disposition=inline&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Security-Token=IQoJb3JpZ2luX2VjEJT%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCWV1LXdlc3QtMiJGMEQCIARfSyuot0h2RNrcqVQkc2T%2B1fJZ64NfjmkmAFgCkTG6AiArmbJDAUr7T85HdqAT2RbyLhmiIgpSo3ci4%2FUtSap2wCrUAwi8%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F8BEAAaDDUwOTM5OTYxODAzMCIMkfFm%2FgBrHsH1qh59KqgDjfZd1%2BKGzxkn7JorfQ07dL%2BL5fjCA6kmNAzCnCjDpTLnNjBfB1vnO2ZLvtC8RNvnaewY6tFWUfl39dC62ldnfajHeFmxkZqBcbDf3oOGnuO2PIvBgb5%2BvppVDkYjWz7vv5TzpgC2sVzjA38QMwxAnausYWDgspap7qjlfoLJUiBOq9SIMZyKVsfeAf4OiUl0TDc2nheqvNXOJy9TPh94KWbBT35vP3fU9A7ZdF4sElm4nVZMnOPdbR7%2Ba6F57nPLZvUaLZC5Nb011ef6%2BhAxr9yeONh5MAoTGUH2qzedDmN%2FbKannddBy%2FNIaP%2BhF7lWUkKemQrM5vajwU6k2Q45pLruKWRkjtrWxdmkQE4zb67ETj5eGL%2BlPPj%2BPtQWzF7UaoWPUH4tGBZ%2Bqdu479rU1ZSg%2B15lR%2F8SAgP%2BydATGwyRtXEvMRJZIiUems8i6ehxWC%2FscY2%2FtCk9OREKhLwOEEdJDAR4vqt68lnnvVomHrVjwNQvyP9A4V8Ct%2B0SjxP%2F86kJnX3o%2FVEoFT44JWICuMuf8kwoelUbZGPl6SaftGsRSUvoy7PV5TCN3du9BjrlAjKhLpjsCwgp1rJ8cPBFcUgOmL3iXrtHs3FhDLljxbXRZ%2FadHkxAlzf%2BXym%2BFBnhdCkDfmWcMEH3GAOFfv%2FlE5SsZMO1JoXbzQlO3OX6nrUacj7LF7ZoO8TYMVoTyEZSLEABNOU7KCILaFeDGRDJ8Ia5I3jnXvOVouFn2VnhykCuWPTunjkMEQBiHa3mbZP0mVcSviujHXatN11INiR%2BPwAN5oxKXeT25B%2FCCI3wib5Av2tzp8zuw8joib5PWNXOYfRgMR7R0Sj%2FjW5SxWr%2BTD9TAD3%2Fqj5pj3Oo13dNGdv5RwGqk1iHd8okpkFYlxEmXD2tTanpxX8ON1%2FLHz%2BNEUJDOogx8TLw5I6mkVs3zjoMhhwn2%2BWrlnNa%2F3i9lAGyLY6Ps4U23Hv7b4gpH4%2BeJN72Z95hrNtcumq4uuf0pRoJPQ9pjiZttjeDwNZzb7d3XuiEQeOgK8rpTeEgduxhdJOOLwZGrg%3D%3D&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=ASIAXNGUVKHXFLYKHBHD%2F20250220%2Feu-west-2%2Fs3%2Faws4_request&X-Amz-Date=20250220T111935Z&X-Amz-Expires=10800&X-Amz-SignedHeaders=host&X-Amz-Signature=64aa008fdafe72f1a693078156451c0f6f702e89e546954d6b3d61abf9f73ec8"
|
241 |
markdown_result = processor.process(single_url, key="1234323")
|
242 |
+
print("Single file Markdown:\n", markdown_result)
|
243 |
+
|
244 |
+
# if __name__ == "__main__":
|
245 |
+
# with open("./test_image.jpg", "rb") as file:
|
246 |
+
# test_image = file.read()
|
247 |
+
|
248 |
+
# print(call_gemini_for_image_description(test_image))
|
249 |
+
|
model_classification/svm_model.joblib
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:70dd9281be75d0e9d3889bb48ad65088344b83d9d7c33c682a012e5468440e1f
|
3 |
+
size 263076507
|