aiben / src /image_utils.py
abugaber's picture
Upload folder using huggingface_hub
3943768 verified
import os
import numpy as np
from scipy.stats import mode
from utils import have_cv2, have_pillow
from enums import images_num_max_dict
def largest_contour(contours):
    """ Return the contour with the greatest area, or None when none has a positive area. """
    import cv2

    # Pair every contour with its area once, so the area is not recomputed.
    scored = [(cv2.contourArea(candidate), candidate) for candidate in contours]

    # max() keeps the first of equally-large entries; the default covers an empty input.
    top_area, top_contour = max(scored, key=lambda pair: pair[0], default=(0, None))

    # Only a strictly positive area counts as a result.
    return top_contour if top_area > 0 else None
def is_contour_acceptable(contour, image, size_threshold=0.1, aspect_ratio_range=(0.5, 2), rotation_threshold=30):
    """
    Check if the contour is acceptable based on size, aspect ratio, and rotation.

    :param contour: contour as accepted by cv2.contourArea / cv2.boundingRect / cv2.minAreaRect
    :param image: array whose first two shape entries are height and width
    :param size_threshold: the contour must cover at least this fraction of the image area
           and at most (1 - size_threshold) of it
    :param aspect_ratio_range: (min, max) allowed width / height of the upright bounding box
    :param rotation_threshold: maximum allowed tilt, in degrees, of the minimum-area rectangle
           away from the nearest image axis
    :return: True if all three checks pass, else False
    """
    import cv2

    # Size check
    image_area = image.shape[0] * image.shape[1]
    if image_area == 0:
        # an empty image cannot contain an acceptable contour (and would divide by zero below)
        return False
    area_fraction = cv2.contourArea(contour) / image_area
    if area_fraction < size_threshold or area_fraction > 1 - size_threshold:
        return False

    # Aspect ratio check
    _, _, w, h = cv2.boundingRect(contour)
    if h == 0:
        return False
    aspect_ratio = w / h
    if aspect_ratio < aspect_ratio_range[0] or aspect_ratio > aspect_ratio_range[1]:
        return False

    # Rotation check
    # The angle convention of minAreaRect differs between OpenCV releases
    # (older ones report [-90, 0), newer ones (0, 90]), so comparing the raw angle
    # to the threshold either never rejects or rejects upright rectangles.
    # Fold it to the deviation from the nearest axis, which lies in [0, 45].
    _, _, angle = cv2.minAreaRect(contour)
    tilt = abs(angle) % 90
    tilt = min(tilt, 90 - tilt)
    return tilt <= rotation_threshold
def file_to_cv2(img_file):
    """
    Load an image file with OpenCV, going through a Pillow-converted PNG if OpenCV cannot read it.

    :param img_file: path of the image file
    :return: image as returned by cv2.imread
    :raises FileNotFoundError: if img_file is not an existing file (checked before any read,
            and not dependent on asserts being enabled)
    :raises ValueError: if neither the direct read nor the Pillow fallback gives an image

    Side effect: when the fallback is used, a file named img_file + '.pil.png' is written.
    """
    # Validate up front instead of after a failed read.
    if not os.path.isfile(img_file):
        raise FileNotFoundError('%s not found' % img_file)

    import cv2

    image = cv2.imread(img_file)
    if image is None:
        # e.g. small BW gif gridnumbers.gif
        from PIL import Image
        pil_image_file = img_file + '.pil.png'
        # context manager so the source file handle is released once the copy is saved
        with Image.open(img_file) as pil_image:
            pil_image.convert('RGB').save(pil_image_file)
        image = cv2.imread(pil_image_file)

    # Check if image is loaded
    if image is None:
        raise ValueError("Error: Image for %s not made." % img_file)
    return image
def align_image(img_file):
    """
    Try to perspective-correct the image by warping its largest four-cornered contour.

    :param img_file: path of the image file
    :return: path of the written img_file + "_aligned.jpg" on success, otherwise img_file itself
             (no acceptable contour, contour not a quadrilateral, or any exception in processing)
    """
    import cv2
    from imutils.perspective import four_point_transform

    try:
        source = file_to_cv2(img_file)

        # grayscale -> blur -> edges -> contours
        grayscale = cv2.cvtColor(source, cv2.COLOR_BGR2GRAY)
        smoothed = cv2.GaussianBlur(grayscale, (5, 5), 0)
        edge_map = cv2.Canny(smoothed, 50, 150, apertureSize=3)
        found, _ = cv2.findContours(edge_map, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)

        outline = largest_contour(found)
        if outline is None or not is_contour_acceptable(outline, source):
            print("No acceptable contours found.")
            return img_file

        # Approximate the outline by a polygon; only a four-point result is warped
        perimeter = cv2.arcLength(outline, True)
        polygon = cv2.approxPolyDP(outline, 0.02 * perimeter, True)
        if len(polygon) != 4:
            print("Contour is not a quadrilateral.")
            return img_file

        warped = four_point_transform(source, polygon.reshape(4, 2))
        out_file = img_file + "_aligned.jpg"
        cv2.imwrite(out_file, warped)
        return out_file
    except Exception as e:
        # best effort: on any failure report it and hand back the unmodified input path
        print("Error in align_image:", e, flush=True)
        return img_file
def correct_rotation(img_file, border_size=50):
    """
    Rotate the image so that its most frequent line direction becomes axis-aligned.

    :param img_file: path of the image file
    :param border_size: currently unused (cropping after rotation is disabled);
           kept so existing callers that pass it keep working
    :return: path of the written img_file + "_rotated.jpg", or img_file itself
             if no line segments were detected
    """
    import cv2

    image = file_to_cv2(img_file)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150, apertureSize=3)

    # Detect line segments using HoughLinesP
    lines = cv2.HoughLinesP(edges, 1, np.pi / 180, threshold=80, minLineLength=100, maxLineGap=10)
    if lines is None or len(lines) == 0:
        return img_file

    # Angle (degrees) of every segment; each segment is x1, y1, x2, y2
    segments = np.asarray(lines).reshape(-1, 4)
    angles = np.degrees(np.arctan2(segments[:, 3] - segments[:, 1],
                                   segments[:, 2] - segments[:, 0]))

    # Most frequent whole-degree angle.
    # Depending on the SciPy version `.mode` is a scalar or a one-element array,
    # so flatten and take the first entry to always get a plain float.
    most_frequent_angle = float(np.ravel(mode(np.round(angles)).mode)[0])

    # Assuming the receipt is horizontal, the text should be near 0 or -180/180 degrees.
    # Fold the angle in steps of 90 into [-45, 45] to minimize rotation and keep the text upright;
    # looping (rather than a single step) also handles angles beyond +/-135.
    while most_frequent_angle < -45:
        most_frequent_angle += 90
    while most_frequent_angle > 45:
        most_frequent_angle -= 90

    # Rotate the original image by that angle around its center
    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, most_frequent_angle, 1.0)
    corrected_image = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)

    # Save the corrected image
    out_file = img_file + "_rotated.jpg"
    cv2.imwrite(out_file, corrected_image)
    return out_file
def pad_resize_image_file(img_file, relaxed_resize=False):
    """
    Resize (relaxed_resize=True) or pad-resize (default) an image file and save the result.

    :param img_file: path of the image file
    :param relaxed_resize: if True use resize_image with max_dimension=2048,
           otherwise use pad_resize_image with its default size
    :return: img_file unchanged when the helper reports no change (None),
             otherwise the path of the newly written PNG (img_file plus a postfix)
    """
    import cv2

    loaded = file_to_cv2(img_file)

    if relaxed_resize:
        suffix = "_resized.png"
        processed = resize_image(loaded, return_none_if_no_change=True, max_dimension=2048)
    else:
        suffix = "_pad_resized.png"
        processed = pad_resize_image(loaded, return_none_if_no_change=True)

    # None from the helper means nothing had to be done, so no new file is written
    if processed is None:
        return img_file

    target = img_file + suffix
    cv2.imwrite(target, processed)
    return target
def resize_image(image, return_none_if_no_change=True, max_dimension=2048):
    """
    Shrink the image so that its longest side equals max_dimension, preserving the aspect ratio.

    :param image: array whose first two shape entries are height and width
    :param return_none_if_no_change: if the image already fits, return None instead of the image
    :param max_dimension: maximum allowed size of the longest side
    :return: the resized image; or, when no resize is needed, None or the input image itself
             depending on return_none_if_no_change
    """
    height, width = image.shape[:2]
    longest_side = max(height, width)

    # No resizing needed if the image is already within the desired dimensions
    if longest_side <= max_dimension:
        return None if return_none_if_no_change else image

    # Imported only here so the no-change path does not require OpenCV
    import cv2

    # Multiply before dividing: the longest side then comes out as exactly max_dimension
    # instead of possibly one pixel less after float truncation. The floor of 1 keeps a very
    # thin image from collapsing to a zero-sized dimension, which cv2.resize rejects.
    new_dimensions = (max(1, int(width * max_dimension // longest_side)),
                      max(1, int(height * max_dimension // longest_side)))
    return cv2.resize(image, new_dimensions, interpolation=cv2.INTER_AREA)
def pad_resize_image(image, return_none_if_no_change=False, max_dimension=1024):
    """
    Fit the image into a max_dimension x max_dimension square by resizing and/or zero padding.

    Cases:
      * already exactly square of the target size: returned unchanged (or None, see below)
      * both sides smaller than the target: centered with zero padding, no scaling
      * both sides larger than the target: scaled down preserving the aspect ratio,
        with no padding added (so the result is only square if the input was)
      * anything else: scaled preserving the aspect ratio so it fits, then zero padded

    In the padded cases the result is exactly max_dimension on both sides; when the
    amount of padding is odd, the extra pixel goes to the right / bottom.

    :param image: array whose first two shape entries are height and width
    :param return_none_if_no_change: if the image already has the target size, return None
    :param max_dimension: side length of the target square
    :return: processed image, or None / the input image if nothing had to be done
    """
    L = H = max_dimension
    Hi, Li = image.shape[:2]

    if Li == L and Hi == H:
        return None if return_none_if_no_change else image

    both_smaller = Li < L and Hi < H
    both_larger = Li > L and Hi > H

    if not both_smaller:
        # Imported only here so pure padding does not require OpenCV
        import cv2

        # Scale to fit inside the target while keeping the aspect ratio.
        # Integer arithmetic avoids float truncation; at least 1 pixel per side.
        if Li * H < Hi * L:
            # The image is taller than the target aspect ratio
            new_height = H
            new_width = max(1, int(H * Li // Hi))
        else:
            # The image is wider than the target aspect ratio
            new_width = L
            new_height = max(1, int(L * Hi // Li))
        image = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_AREA)

        if both_larger:
            # NOTE(review): this case has never been padded to a square; confirm that is intended
            return image
        Li, Hi = new_width, new_height

    # Zero padding; the remainder of an odd gap is placed on the right / bottom so the
    # output is exactly L x H rather than one pixel short.
    left = (L - Li) // 2
    right = (L - Li) - left
    top = (H - Hi) // 2
    bottom = (H - Hi) - top
    pad_width = [(top, bottom), (left, right)] + [(0, 0)] * (image.ndim - 2)
    return np.pad(image, pad_width, mode='constant', constant_values=0)
def fix_image_file(file, do_align=False, do_rotate=False, do_pad=False, relaxed_resize=False):
    """
    Run the requested clean-up steps on an image file, in order: align, rotate, pad/resize.

    :param file: path of the image file
    :param do_align: run align_image
    :param do_rotate: run correct_rotation
    :param do_pad: run pad_resize_image_file
    :param relaxed_resize: also triggers pad_resize_image_file, and is passed through to it
    :return: path of the final file; the input path if have_cv2 is false or no step applied
    """
    # always try to fix rotation/alignment since OCR better etc. in that case
    if not have_cv2:
        return file

    if do_align:
        candidate = align_image(file)
        # only switch to the new file if it really exists
        if candidate is not None and os.path.isfile(candidate):
            file = candidate

    if do_rotate:
        candidate = correct_rotation(file)
        if candidate is not None and os.path.isfile(candidate):
            file = candidate

    if do_pad or relaxed_resize:
        file = pad_resize_image_file(file, relaxed_resize=relaxed_resize)

    return file
def get_image_types():
    """
    List the file extensions (without leading dot) that Pillow has registered and can open.

    :return: sorted list of extensions, or an empty list if have_pillow is false
    """
    if not have_pillow:
        return []

    from PIL import Image

    # keep only extensions whose format appears in Image.OPEN
    openable = {suffix for suffix, fmt in Image.registered_extensions().items() if fmt in Image.OPEN}

    # sort first (with the dot still attached), then drop a single leading dot
    return [suffix[1:] if suffix.startswith('.') else suffix for suffix in sorted(openable)]
def get_image_file(image_file, image_control, document_choice, base_model=None, images_num_max=None,
                   image_resolution=None, image_format=None,
                   convert=False,
                   str_bytes=True):
    """
    Collect the image inputs to use, optionally convert them, and cap their number.

    Source priority: image_control, then image_file, then the entries of document_choice
    that end with one of the extensions from get_image_types().

    :param image_file: single item or list of items
    :param image_control: single item or list of items; wins over image_file if not None
    :param document_choice: iterable of names, only used if both of the above are None
    :param base_model: if set, used to look up a limit in images_num_max_dict
    :param images_num_max: maximum number of items to return:
           None -> per-model value if base_model is set (falling back to 1), else no limit;
           -1 -> per-model value if base_model is set;
           any value <= -1 left after that is mapped to (-value - 1)
    :param image_resolution: passed to img_to_base64 as resolution
    :param image_format: passed to img_to_base64 as output_format
    :param convert: if True, existing files go through img_to_base64, other strings are kept
           as they are, anything else is dropped
    :param str_bytes: passed to img_to_base64
    :return: list of the remaining truthy items, truncated to the limit
    """
    # choose the source
    if image_control is not None:
        candidates = image_control
    elif image_file is not None:
        candidates = image_file
    else:
        image_types = get_image_types()
        if document_choice:
            candidates = [name for name in document_choice
                          if any(name.endswith('.' + ext) for ext in image_types)]
        else:
            candidates = []

    # always work on a list
    if not isinstance(candidates, list):
        candidates = [candidates]

    def _prepare(item):
        # without conversion every item passes through untouched
        if not convert:
            return item
        if item and os.path.isfile(item):
            from vision.utils_vision import img_to_base64
            return img_to_base64(item, str_bytes=str_bytes, resolution=image_resolution,
                                 output_format=image_format)
        # a non-file string is assumed to be already-converted data; anything else is dropped
        return item if isinstance(item, str) else None

    prepared = [_prepare(item) for item in candidates]
    kept = [item for item in prepared if item]

    # work out how many items may be returned
    limit = images_num_max
    if base_model:
        if limit == -1:
            limit = images_num_max_dict.get(base_model, 1)
        if limit is None:
            limit = images_num_max_dict.get(base_model, 1) or 1
    if limit is None:
        limit = len(kept)
    elif limit <= -1:
        limit = -limit - 1

    return kept[:limit]