import logging
import re
import sys

import cv2
import numpy as np
import pytesseract
from PIL import Image

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)],
)

# A number (optional integer part, optional decimal point) followed by an
# optional unit. The unit alternation must be spelled out: a character class
# such as [kg] matches a single character and would truncate "kg" to "k".
_WEIGHT_PATTERN = re.compile(r'(\d*\.?\d+)(kg?|g)?')

# Readings outside this inclusive range are rejected as OCR noise.
_MIN_WEIGHT = 0.001
_MAX_WEIGHT = 5000


def preprocess_image(img):
    """Preprocess an image for OCR of a weight display.

    The pipeline is: grayscale -> CLAHE contrast enhancement -> inverted
    adaptive Gaussian threshold -> 3x3 sharpening filter -> non-local-means
    denoising.

    Args:
        img: A PIL image (any mode; it is converted to RGB first) or an
            array-like holding a 2-D grayscale, 3-channel RGB or 4-channel
            RGBA image.

    Returns:
        The denoised single-channel image as a NumPy array. If a step after
        the grayscale conversion fails, the plain grayscale image is returned
        as a fallback.

    Raises:
        Exception: Whatever the conversion raised, if the failure happens
            before a grayscale image could be produced (there is nothing to
            fall back to in that case).
    """
    gray = None
    try:
        # Normalise every PIL mode (L, RGBA, P, 1, ...) to 3-channel RGB so
        # the colour conversion below always receives a supported layout.
        if isinstance(img, Image.Image):
            img = img.convert("RGB")
        pixels = np.asarray(img)

        # Convert to grayscale, tolerating arrays that are already gray
        # or that carry an alpha channel.
        if pixels.ndim == 2:
            gray = pixels
        elif pixels.shape[2] == 4:
            gray = cv2.cvtColor(pixels, cv2.COLOR_RGBA2GRAY)
        else:
            gray = cv2.cvtColor(pixels, cv2.COLOR_RGB2GRAY)

        # Enhance contrast for diverse lighting conditions
        clahe = cv2.createCLAHE(clipLimit=4.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)

        # Block size scales with image height, clamped to [11, 31]; the
        # "* 2 + 1" keeps it odd, as adaptiveThreshold requires.
        block_size = max(11, min(31, int(gray.shape[0] / 20) * 2 + 1))
        thresh = cv2.adaptiveThreshold(
            enhanced,
            255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY_INV,
            block_size,
            3,
        )

        # Sharpen to enhance edges
        kernel = np.array([[-1, -1, -1], [-1, 9, -1], [-1, -1, -1]])
        sharpened = cv2.filter2D(thresh, -1, kernel)

        # Denoise for noisy images
        return cv2.fastNlMeansDenoising(sharpened, h=10)
    except Exception as e:
        logging.error("Preprocessing failed: %s", e)
        if gray is None:
            # Failure happened before grayscale conversion: surface the real
            # error instead of an UnboundLocalError on the fallback value.
            raise
        return gray  # Fallback to grayscale


def _parse_weight_text(text):
    """Parse raw OCR text into ``(weight_str, confidence, unit)`` or ``None``.

    Whitespace is removed, the text is lower-cased and decimal commas are
    turned into dots *before* matching, so "68,0 kg" is read as 68.0 kg.
    Leading zeros are dropped from the returned string, but one zero is kept
    in front of a decimal point ("0.5", not ".5"). The unit defaults to "g"
    when none is recognised; a lone "k" is reported as "kg".

    Returns ``None`` when no number is found or the value lies outside
    ``[_MIN_WEIGHT, _MAX_WEIGHT]``.
    """
    cleaned = re.sub(r'\s+', '', text.strip().lower()).replace(',', '.')
    match = _WEIGHT_PATTERN.search(cleaned)
    if match is None:
        return None

    unit = match.group(2) or "g"  # Default to grams if no unit
    if unit == "k":
        unit = "kg"

    significant = match.group(1).lstrip('0')
    # Heuristic score: readings with three or more digits (after leading
    # zeros are removed) are rated higher than shorter ones.
    confidence = 95.0 if len(significant.replace('.', '')) >= 3 else 90.0

    if not significant or significant.startswith('.'):
        weight_str = '0' + significant
    else:
        weight_str = significant

    try:
        weight = float(weight_str)
    except ValueError:
        logging.warning("Invalid weight format: %s", weight_str)
        return None

    if not _MIN_WEIGHT <= weight <= _MAX_WEIGHT:
        return None
    return weight_str, confidence, unit


def extract_weight_from_image(pil_img):
    """Extract the weight and unit shown in a digital scale image.

    The image is preprocessed with ``preprocess_image``, passed to Tesseract
    as a single text line restricted to digits, '.', ',' and the letters
    k/g, and the recognised text is parsed for a number and a unit.

    Args:
        pil_img: The image of the scale display (PIL image).

    Returns:
        A tuple ``(weight_str, confidence, unit)``. On success ``weight_str``
        is the numeric string, ``confidence`` is 90.0 or 95.0 and ``unit`` is
        "g" or "kg". When nothing valid is detected, or any step raises, the
        result is ``("Not detected", 0.0, "")``.
    """
    try:
        # Preprocess image
        processed = preprocess_image(pil_img)

        # Convert to PIL for Tesseract
        ocr_input = Image.fromarray(cv2.cvtColor(processed, cv2.COLOR_GRAY2RGB))

        # Try Tesseract with optimized config for numbers and units
        config = r'--oem 3 --psm 7 -c tessedit_char_whitelist=0123456789.,kgKg'
        text = pytesseract.image_to_string(ocr_input, config=config)
        logging.info("Tesseract raw output: %s", text)

        parsed = _parse_weight_text(text)
        if parsed is not None:
            weight_str, confidence, unit = parsed
            logging.info(
                "Detected weight: %s %s, Confidence: %.2f%%",
                float(weight_str),
                unit,
                confidence,
            )
            return weight_str, confidence, unit

        logging.info("No valid weight detected.")
        return "Not detected", 0.0, ""
    except Exception as e:
        # Top-level boundary: callers always get the 3-tuple, never an error.
        logging.error("Weight extraction failed: %s", e)
        return "Not detected", 0.0, ""