Update src/ocr_engine.py
Browse files- src/ocr_engine.py +22 -10
src/ocr_engine.py
CHANGED
@@ -1,10 +1,22 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
def _import_cv2():
    """Import and return ``cv2``, installing the system libGL package only if needed."""
    try:
        import cv2
    except ImportError:
        # The import can fail when the system libGL library is missing
        # (the reason the original code installed libgl1-mesa-glx).
        # Attempt the install once, best-effort, then retry the import so a
        # genuine ImportError still reaches the caller.
        import subprocess

        for command in (
            ["apt-get", "update"],
            ["apt-get", "install", "-y", "libgl1-mesa-glx"],
        ):
            try:
                # Mirror the original `&&`: stop at the first failing step.
                if subprocess.run(command, check=False).returncode != 0:
                    break
            except OSError:
                # apt-get is not available on this system.
                break
        import cv2
    return cv2


def _valid_confidences(raw_confs):
    """Return the non-negative numeric entries of *raw_confs* as floats."""
    confidences = []
    for conf in raw_confs:
        # Depending on the pytesseract version, entries may be ints, floats
        # or strings, so parse numerically instead of calling str.isdigit().
        try:
            value = float(conf)
        except (TypeError, ValueError):
            continue
        # Negative values (e.g. -1) are excluded, as the original
        # isdigit() filter did for string input.
        if value >= 0:
            confidences.append(value)
    return confidences


def extract_weight_from_image(pil_img):
    """Run digit-only OCR on an image and return the text with its confidence.

    The image is converted to grayscale, blurred with a 3x3 Gaussian kernel,
    and binarized with Otsu thresholding before being passed to Tesseract
    with a whitelist of digits and ``.``.

    Args:
        pil_img: Image object exposing ``convert("RGB")`` (presumably a
            ``PIL.Image.Image`` - confirm against callers).

    Returns:
        A ``(text, avg_conf)`` tuple. ``text`` contains only the characters
        ``0-9`` and ``.`` from the OCR output; ``avg_conf`` is the mean of
        the non-negative confidences reported by Tesseract, or ``0`` when
        there are none.
    """
    # Imports stay function-local, as in the original, so that importing this
    # module does not require OpenCV/Tesseract to be loadable.
    cv2 = _import_cv2()
    import numpy as np
    import pytesseract

    image = np.array(pil_img.convert("RGB"))
    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    blur = cv2.GaussianBlur(gray, (3, 3), 0)
    _, thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    config = '--psm 7 -c tessedit_char_whitelist=0123456789.'
    data = pytesseract.image_to_data(
        thresh, config=config, output_type=pytesseract.Output.DICT
    )

    # The result dict may be empty when Tesseract emits no rows, so do not
    # assume the 'text' / 'conf' keys exist.
    raw_text = ''.join(str(piece) for piece in data.get('text', []))
    extracted_text = ''.join(ch for ch in raw_text if ch in '0123456789.')

    confidences = _valid_confidences(data.get('conf', []))
    avg_conf = sum(confidences) / len(confidences) if confidences else 0

    return extracted_text, avg_conf
|