import gradio as gr
import torch
import clip
import os
import hashlib
import numpy as np
import cv2
from PIL import Image
import onnxruntime as ort
import requests

def binary_array_to_hex(arr: np.ndarray) -> str:
    """Convert a binary array to a hex string."""
    bit_string = ''.join(str(int(b)) for b in arr.flatten())
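    # Zero-pad to a fixed width: 4 bits per hex digit (64 bits -> 16 hex chars).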
    width = int(np.ceil(len(bit_string) / 4))
    return '{:0>{width}x}'.format(int(bit_string, 2), width=width)

def phash(image: Image.Image, hash_size: int = 8, highfreq_factor: int = 4) -> str:
    """Calculate the perceptual hash of an image."""
    if hash_size < 2:
        raise ValueError('Hash size must be greater than or equal to 2')
    
    import scipy.fftpack  # deferred import: scipy is only needed for the DCT
    img_size = hash_size * highfreq_factor
    image = image.convert('L').resize((img_size, img_size), Image.Resampling.LANCZOS)
    pixels = np.asarray(image)
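    # 2-D DCT; the top-left block holds the low-frequency image structure.
    # Each hash bit records whether a coefficient exceeds the block median.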
    dct = scipy.fftpack.dct(scipy.fftpack.dct(pixels, axis=0), axis=1)
    dctlowfreq = dct[:hash_size, :hash_size]
    med = np.median(dctlowfreq)
    diff = dctlowfreq > med
    return binary_array_to_hex(diff)

def normalize(a: np.ndarray, axis: int = -1, order: int = 2) -> np.ndarray:
    """Normalize a numpy array."""
    l2 = np.atleast_1d(np.linalg.norm(a, order, axis))
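    # Guard against division by zero for all-zero vectors.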
    l2[l2 == 0] = 1
    return a / np.expand_dims(l2, axis)

def convert_numpy_types(data):
    """Convert numpy types to Python native types."""
    if isinstance(data, dict):
        return {key: convert_numpy_types(value) for key, value in data.items()}
    elif isinstance(data, list):
        return [convert_numpy_types(item) for item in data]
    elif isinstance(data, np.float64):
        return float(data)
    elif isinstance(data, np.int64):
        return int(data)
    else:
        return data

def download_model(url: str, path: str) -> None:
    """Download a model from a URL, skipping the download if the file already exists."""
    if os.path.exists(path):
        return
    response = requests.get(url)
    response.raise_for_status()
    with open(path, 'wb') as f:
        f.write(response.content)

# Load models once at import time so predict() only performs inference.
onnx_url = "https://huggingface.co/haor/aesthetics/resolve/main/aesthetic_score_mlp.onnx"
onnx_path = "aesthetic_score_mlp.onnx"
download_model(onnx_url, onnx_path)

device = "cuda" if torch.cuda.is_available() else "cpu"
ort_session = ort.InferenceSession(onnx_path)
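# CLIP ViT-L/14 produces 768-dimensional image embeddings, matching the
# input size of the aesthetic MLP head.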
model, preprocess = clip.load("ViT-L/14", device=device)

def predict(image: np.ndarray) -> dict:
    """Predict the aesthetic score of an image using CLIP."""
    image = Image.fromarray(image) 
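    # Variance of the Laplacian is a simple no-reference sharpness measure:
    # low values suggest a blurry image.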
    image_np = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
    laplacian_variance = cv2.Laplacian(image_np, cv2.CV_64F).var()
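    # Perceptual hash for near-duplicate detection, plus exact hashes over the
    # decoded pixel bytes (not the original file bytes).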
    phash_str = phash(image)
    md5_hash = hashlib.md5(image.tobytes()).hexdigest()
    sha1_hash = hashlib.sha1(image.tobytes()).hexdigest()
    
    inputs = preprocess(image).unsqueeze(0).to(device)

    with torch.no_grad():
        img_emb = model.encode_image(inputs)
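        # L2-normalize the embedding before scoring; the ONNX head expects
        # float32 input.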
        img_emb = normalize(img_emb.cpu().numpy())
        ort_inputs = {ort_session.get_inputs()[0].name: img_emb.astype(np.float32)}
        ort_outs = ort_session.run(None, ort_inputs)
        prediction = ort_outs[0].item()
        
    result = {
        "clip_aesthetic": prediction,
        "phash": phash_str,
        "md5": md5_hash, 
        "sha1": sha1_hash,
        "laplacian_variance": laplacian_variance
    }
    return convert_numpy_types(result)

title = "CLIP Aesthetic Score"
description = "Upload an image to predict its aesthetic score using the CLIP model and other metrics."

demo = gr.Interface(
    fn=predict,
    inputs=gr.Image(type="numpy"),
    outputs=gr.JSON(label="Result"),
    title=title,
    description=description,
    examples=[["example1.jpg"], ["example2.jpg"]]
)

if __name__ == "__main__":
    demo.launch()