brestok committed on
Commit
b28ce62
·
0 Parent(s):
Files changed (1) hide show
  1. ocr/api/message/utils.py +53 -0
ocr/api/message/utils.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import io
3
+ import re
4
+
5
+ import pytesseract
6
+ from PIL import Image
7
+ from pdf2image import convert_from_bytes
8
+
9
+
10
def divide_images(contents: bytes, *, dpi: int = 250) -> list[bytes]:
    """Render a PDF into a list of PNG-encoded images.

    Args:
        contents: Raw bytes of the PDF document, passed straight to
            ``pdf2image.convert_from_bytes``.
        dpi: Rendering resolution forwarded to ``convert_from_bytes``.
            Defaults to 250, the value that used to be hard-coded.

    Returns:
        The PNG-encoded bytes of every image produced by
        ``convert_from_bytes``, in the order they were returned.
    """
    pages = convert_from_bytes(contents, dpi=dpi)
    encoded_pages = []
    for page in pages:
        # The context manager frees the buffer as soon as the PNG bytes
        # have been copied out; getvalue() needs no seek(0) beforehand.
        with io.BytesIO() as buffer:
            page.save(buffer, format='PNG')
            encoded_pages.append(buffer.getvalue())
    return encoded_pages
19
+
20
+
21
def extract_text_from_images(images: list[bytes]) -> str:
    """Run OCR over each encoded image and join the results.

    Args:
        images: Encoded image files (anything ``PIL.Image.open`` can read),
            one ``bytes`` object per image.

    Returns:
        The text returned by ``pytesseract.image_to_string`` for every
        image, joined with a newline. An empty input yields ``''``.
    """
    page_texts = []
    for image_bytes in images:
        # Close the decoded image deterministically instead of waiting for
        # garbage collection; large page renders hold a lot of memory.
        with Image.open(io.BytesIO(image_bytes)) as image:
            page_texts.append(pytesseract.image_to_string(image))
    return '\n'.join(page_texts)
30
+
31
+
32
# Matches the first ```markdown ... ``` fenced block; DOTALL lets the body
# span several lines, and the surrounding \s* trims its whitespace.
_MARKDOWN_FENCE_RE = re.compile(r'```markdown\s*(.*?)\s*```', re.DOTALL)


def clean_response(text: str) -> str:
    """Strip a ```markdown fenced wrapper from *text*, if present.

    Args:
        text: The response text, possibly containing a fenced block that
            starts with ```markdown.

    Returns:
        The contents of the first ```markdown fenced block with its
        surrounding whitespace removed, or *text* unchanged when no such
        block exists. Non-string input is returned as-is, matching the
        earlier best-effort behaviour.
    """
    if not isinstance(text, str):
        # The previous implementation swallowed the TypeError raised here
        # and returned the value untouched; keep that contract explicit.
        return text
    match = _MARKDOWN_FENCE_RE.search(text)
    if match is None:
        return text
    return match.group(1)
38
+
39
+
40
def _sniff_image_mime(image: bytes) -> str:
    """Return the MIME type implied by the leading magic bytes of *image*."""
    if image.startswith(b"\x89PNG\r\n\x1a\n"):
        return "image/png"
    if image.startswith(b"\xff\xd8\xff"):
        return "image/jpeg"
    if image.startswith((b"GIF87a", b"GIF89a")):
        return "image/gif"
    if image[:4] == b"RIFF" and image[8:12] == b"WEBP":
        return "image/webp"
    # Unknown data keeps the label this module has always used.
    return "image/jpeg"


def prepare_request_content(images: list[bytes]) -> list[dict]:
    """Build the multimodal message content for a report request.

    Args:
        images: Encoded image files, one ``bytes`` object per image.

    Returns:
        A list whose first element is the text instruction, followed by one
        ``image_url`` entry per image. Each entry carries the image as a
        base64 data URL whose MIME type is detected from the image's magic
        bytes (PNG, JPEG, GIF, WebP), falling back to ``image/jpeg``.
        Previously every image was labelled ``image/jpeg`` even though
        ``divide_images`` emits PNG data.
    """
    content = [
        {"type": "text", "text": "Generate a report on the attached document"},
    ]
    for image in images:
        mime_type = _sniff_image_mime(image)
        encoded = base64.b64encode(image).decode('utf-8')
        content.append(
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:{mime_type};base64,{encoded}",
                },
            }
        )
    return content