Sanjayraju30 committed on
Commit
0e76d8f
·
verified ·
1 Parent(s): d07e9f7

Update ocr_engine.py

Browse files
Files changed (1) hide show
  1. ocr_engine.py +17 -18
ocr_engine.py CHANGED
@@ -3,16 +3,20 @@ from PIL import Image, ImageFilter
3
  import torch
4
  import re
5
 
6
- # Load model
7
  processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
8
  model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
9
 
10
  def clean_ocr_text(text):
11
- print("[RAW OCR]", text)
12
  text = text.replace(",", ".").replace("s", "5").replace("o", "0").replace("O", "0")
13
- text = re.sub(r"[^\d.kg]", "", text.lower()) # Keep digits, dots, and kg
14
- print("[CLEANED OCR]", text)
15
- return text
 
 
 
 
 
 
16
 
17
  def restore_decimal(text):
18
  if re.fullmatch(r"\d{5}", text):
@@ -32,20 +36,15 @@ def extract_weight(image):
32
 
33
  cleaned = clean_ocr_text(raw_text)
34
 
35
- # Try direct match: e.g., 52.25 kg or 75.0 g
36
- match = re.search(r"(\d{1,3}\.\d{1,3})\s*(kg|g)", cleaned)
37
  if match:
38
- return f"{match.group(1)} {match.group(2)}"
39
-
40
- # Try fallback: extract digits and manually guess decimal
41
- fallback_match = re.search(r"(\d{4,5})", cleaned)
42
- if fallback_match:
43
- fallback_value = restore_decimal(fallback_match.group(1))
44
 
45
- # Check for presence of unit hints in raw_text
46
- unit = "kg" if "kg" in raw_text.lower() else "g"
47
- return f"{fallback_value} {unit}"
 
48
 
49
- return f"No valid weight found | OCR: {cleaned}"
50
  except Exception as e:
51
- return f"Error: {str(e)}"
 
3
  import torch
4
  import re
5
 
 
6
  processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
7
  model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
8
 
9
  def clean_ocr_text(text):
 
10
  text = text.replace(",", ".").replace("s", "5").replace("o", "0").replace("O", "0")
11
+ return re.sub(r"[^\d.kg]", "", text.lower())
12
+
13
+ def extract_unit_from_text(raw_text):
14
+ raw = raw_text.lower()
15
+ if "kg" in raw:
16
+ return "kg"
17
+ elif "g" in raw:
18
+ return "g"
19
+ return "g" # default fallback
20
 
21
  def restore_decimal(text):
22
  if re.fullmatch(r"\d{5}", text):
 
36
 
37
  cleaned = clean_ocr_text(raw_text)
38
 
39
+ match = re.search(r"(\d{1,3}\.\d{1,3})\s*(kg|g)?", cleaned)
 
40
  if match:
41
+ return f"{match.group(1)} {match.group(2) or ''}", raw_text
 
 
 
 
 
42
 
43
+ fallback = re.search(r"\d{4,5}", cleaned)
44
+ if fallback:
45
+ fixed = restore_decimal(fallback.group())
46
+ return f"{fixed}", raw_text
47
 
48
+ return f"No valid weight found | OCR: {cleaned}", raw_text
49
  except Exception as e:
50
+ return f"Error: {str(e)}", ""