Sanjayraju30 committed on
Commit
1a4ff0a
·
verified ·
1 Parent(s): 844e1ad

Update src/ocr_engine.py

Browse files
Files changed (1) hide show
  1. src/ocr_engine.py +22 -10
src/ocr_engine.py CHANGED
@@ -1,10 +1,22 @@
1
- import pytesseract
2
- from PIL import Image
3
- import re
4
-
5
def extract_weight_from_image(image):
    """Run OCR on *image* and return the first two-decimal number in the text.

    The image is passed to ``pytesseract.image_to_string`` and the resulting
    text is searched for the first run of digits followed by a dot and exactly
    two more digits (for example ``"12.50"``).

    Returns:
        The matched substring, or the literal string ``"No weight found"``
        when the OCR text contains no such number.
    """
    ocr_output = pytesseract.image_to_string(image)
    reading = re.search(r"\d+\.\d{2}", ocr_output)
    # Guard clause: callers receive a sentinel string rather than None.
    if reading is None:
        return "No weight found"
    return reading.group()
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def _mean_confidence(raw_confidences):
    """Return the mean of the usable confidence values, or 0 if there are none.

    Values may arrive as ints, floats or numeric strings depending on the
    pytesseract version, so each one is parsed with ``float``. Entries that
    cannot be parsed, and negative entries (Tesseract reports ``-1`` for rows
    that carry no recognised word), are ignored.
    """
    scores = []
    for raw in raw_confidences:
        try:
            score = float(raw)
        except (TypeError, ValueError):
            continue
        if score >= 0:
            scores.append(score)
    return sum(scores) / len(scores) if scores else 0


def extract_weight_from_image(pil_img):
    """OCR a single line of digits from *pil_img* and report the confidence.

    The image is converted to grayscale, lightly blurred and binarised with
    Otsu's threshold before being handed to Tesseract in single-line mode
    (``--psm 7``) with a whitelist limited to digits and the decimal point.

    Args:
        pil_img: An image object exposing ``convert("RGB")`` (a PIL image).

    Returns:
        A ``(text, avg_conf)`` tuple: ``text`` is the concatenation of all
        recognised digit and ``.`` characters (possibly empty), and
        ``avg_conf`` is the mean of the non-negative per-word confidences
        reported by Tesseract, or ``0`` when none are available.

    Raises:
        ImportError: If OpenCV cannot be imported even after the best-effort
            system-library install below.
    """
    try:
        import cv2
    except ImportError:
        # Best-effort fallback kept from the original code: OpenCV needs the
        # system libGL library. Only attempt the install when the import
        # actually fails, instead of shelling out to apt-get on every call.
        # NOTE(review): this dependency is better installed at deploy time.
        import subprocess

        try:
            refreshed = subprocess.run(["apt-get", "update"], check=False)
            if refreshed.returncode == 0:
                subprocess.run(
                    ["apt-get", "install", "-y", "libgl1-mesa-glx"],
                    check=False,
                )
        except OSError:
            # apt-get is not available here; the retry below will raise the
            # informative ImportError.
            pass
        import cv2

    import numpy as np
    import pytesseract

    # Pre-process: RGB -> grayscale -> small Gaussian blur -> Otsu binarisation.
    image = np.array(pil_img.convert("RGB"))
    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    blur = cv2.GaussianBlur(gray, (3, 3), 0)
    _, thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    config = '--psm 7 -c tessedit_char_whitelist=0123456789.'
    data = pytesseract.image_to_data(
        thresh, config=config, output_type=pytesseract.Output.DICT
    )

    # Keep only digits and dots in case the whitelist is not honoured.
    allowed = '0123456789.'
    extracted_text = ''.join(ch for ch in ''.join(data['text']) if ch in allowed)

    return extracted_text, _mean_confidence(data['conf'])