Spaces:
Running
Running
Update ocr_engine.py
Browse files- ocr_engine.py +17 -18
ocr_engine.py
CHANGED
@@ -3,16 +3,20 @@ from PIL import Image, ImageFilter
|
|
3 |
import torch
|
4 |
import re
|
5 |
|
6 |
-
# Load model
|
7 |
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
|
8 |
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
|
9 |
|
10 |
def clean_ocr_text(text):
|
11 |
-
print("[RAW OCR]", text)
|
12 |
text = text.replace(",", ".").replace("s", "5").replace("o", "0").replace("O", "0")
|
13 |
-
|
14 |
-
|
15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
|
17 |
def restore_decimal(text):
|
18 |
if re.fullmatch(r"\d{5}", text):
|
@@ -32,20 +36,15 @@ def extract_weight(image):
|
|
32 |
|
33 |
cleaned = clean_ocr_text(raw_text)
|
34 |
|
35 |
-
|
36 |
-
match = re.search(r"(\d{1,3}\.\d{1,3})\s*(kg|g)", cleaned)
|
37 |
if match:
|
38 |
-
return f"{match.group(1)} {match.group(2)}"
|
39 |
-
|
40 |
-
# Try fallback: extract digits and manually guess decimal
|
41 |
-
fallback_match = re.search(r"(\d{4,5})", cleaned)
|
42 |
-
if fallback_match:
|
43 |
-
fallback_value = restore_decimal(fallback_match.group(1))
|
44 |
|
45 |
-
|
46 |
-
|
47 |
-
|
|
|
48 |
|
49 |
-
return f"No valid weight found | OCR: {cleaned}"
|
50 |
except Exception as e:
|
51 |
-
return f"Error: {str(e)}"
|
|
|
3 |
import torch
|
4 |
import re
|
5 |
|
|
|
6 |
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
|
7 |
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
|
8 |
|
9 |
# Single-character substitutions applied to case-folded OCR output:
# a comma becomes a decimal point, and the letters "s" / "o" become the
# digits "5" / "0".
_OCR_CHAR_FIXES = str.maketrans({",": ".", "s": "5", "o": "0"})


def clean_ocr_text(text):
    """Normalize raw OCR output down to digits, dots and the letters k/g.

    The text is lower-cased first so the look-alike substitutions apply
    regardless of case ("S" and "s" both become "5", "O" and "o" both
    become "0"). Every character that is not a digit, ".", "k" or "g" is
    then removed, which also strips whitespace.

    Args:
        text: Raw string produced by the OCR model.

    Returns:
        The cleaned string, e.g. ``"S2,5 KG"`` -> ``"52.5kg"``.
    """
    # Case-fold before substituting; otherwise an upper-case "S" would be
    # dropped by the filter below instead of being read as the digit 5.
    normalized = text.lower().translate(_OCR_CHAR_FIXES)
    return re.sub(r"[^\d.kg]", "", normalized)
|
12 |
+
|
13 |
+
def extract_unit_from_text(raw_text):
    """Return the weight unit mentioned in *raw_text*: ``"kg"`` or ``"g"``.

    The lookup is a case-insensitive substring check for "kg". Grams are
    the default, so any text that does not contain "kg" yields ``"g"``
    whether or not a "g" is actually present.

    Args:
        raw_text: Unprocessed OCR string.

    Returns:
        ``"kg"`` if the text contains "kg" in any letter case, else ``"g"``.
    """
    return "kg" if "kg" in raw_text.lower() else "g"
|
20 |
|
21 |
def restore_decimal(text):
|
22 |
if re.fullmatch(r"\d{5}", text):
|
|
|
36 |
|
37 |
cleaned = clean_ocr_text(raw_text)
|
38 |
|
39 |
+
match = re.search(r"(\d{1,3}\.\d{1,3})\s*(kg|g)?", cleaned)
|
|
|
40 |
if match:
|
41 |
+
return f"{match.group(1)} {match.group(2) or ''}", raw_text
|
|
|
|
|
|
|
|
|
|
|
42 |
|
43 |
+
fallback = re.search(r"\d{4,5}", cleaned)
|
44 |
+
if fallback:
|
45 |
+
fixed = restore_decimal(fallback.group())
|
46 |
+
return f"{fixed}", raw_text
|
47 |
|
48 |
+
return f"No valid weight found | OCR: {cleaned}", raw_text
|
49 |
except Exception as e:
|
50 |
+
return f"Error: {str(e)}", ""
|