import os
from typing import List

import clip
import torch
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
from torchvision.datasets.utils import download_url
from transformers import AutoModel, AutoProcessor

# All metrics.
__all__ = ["AestheticScore", "CLIPScore"]

_MODELS = {
    "CLIP_ViT-L/14": "https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/easyanimate/video_caption/clip/ViT-L-14.pt",
    "Aesthetics_V2": "https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/easyanimate/video_caption/clip/sac%2Blogos%2Bava1-l14-linearMSE.pth",
}
_MD5 = {
    "CLIP_ViT-L/14": "096db1af569b284eb76b3881534822d9",
    "Aesthetics_V2": "b1047fd767a00134b8fd6529bf19521a",
}


# If you change the MLP architecture during training, change it here as well:
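# Note: the pretrained "sac+logos+ava1-l14-linearMSE.pth" checkpoint expects exactly this
# layer layout, with 768-dim CLIP ViT-L/14 image embeddings as input and a scalar score as output.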
class _MLP(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.input_size = input_size
        self.layers = nn.Sequential(
            nn.Linear(self.input_size, 1024),
            # nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(1024, 128),
            # nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            # nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(64, 16),
            # nn.ReLU(),
            nn.Linear(16, 1),
        )

    def forward(self, x):
        return self.layers(x)


class AestheticScore:
    """Compute LAION Aesthetics Score V2 based on openai/clip. Note that the default
    inference dtype with GPUs is fp16 in openai/clip.

    Ref:
    1. https://github.com/christophschuhmann/improved-aesthetic-predictor/blob/main/simple_inference.py.
    2. https://github.com/openai/CLIP/issues/30.
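
    Example (a minimal sketch; the image path below is hypothetical)::

        scorer = AestheticScore(device="cuda")
        scores = scorer([Image.open("example.jpg").convert("RGB")])  # roughly a 1-10 aesthetic scale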
    """

    def __init__(self, root: str = "~/.cache/clip", device: str = "cpu"):
        # clip.load returns the CLIP model in evaluation mode.
        self.root = os.path.expanduser(root)
        os.makedirs(self.root, exist_ok=True)
        filename = "ViT-L-14.pt"
        download_url(_MODELS["CLIP_ViT-L/14"], self.root, filename=filename, md5=_MD5["CLIP_ViT-L/14"])
        self.clip_model, self.preprocess = clip.load(os.path.join(self.root, filename), device=device)
        self.device = device
        self._load_mlp()

    def _load_mlp(self):
        filename = "sac+logos+ava1-l14-linearMSE.pth"
        download_url(_MODELS["Aesthetics_V2"], self.root, filename=filename, md5=_MD5["Aesthetics_V2"])
        state_dict = torch.load(os.path.join(self.root, filename), map_location=self.device)
        self.mlp = _MLP(768)
        self.mlp.load_state_dict(state_dict)
        self.mlp.to(self.device)
        self.mlp.eval()

    def __call__(self, images: List[Image.Image], texts=None) -> List[float]:
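        # `texts` is unused here; it is presumably accepted only to keep the call signature uniform with CLIPScore.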
        with torch.no_grad():
            images = torch.stack([self.preprocess(image) for image in images]).to(self.device)
            image_embs = F.normalize(self.clip_model.encode_image(images))
            scores = self.mlp(image_embs.float())  # torch.float16 -> torch.float32, [N, 1]
        return scores.squeeze(-1).tolist()  # [N, 1] -> list of N floats, even when N == 1
    
    def __repr__(self) -> str:
        return "aesthetic_score"


class CLIPScore:
    """Compute CLIP scores for image-text pairs based on huggingface/transformers."""

    def __init__(
        self,
        model_name_or_path: str = "openai/clip-vit-large-patch14",
        torch_dtype=torch.float16,
        device: str = "cpu",
    ):
        self.model = AutoModel.from_pretrained(model_name_or_path, torch_dtype=torch_dtype).eval().to(device)
        self.processor = AutoProcessor.from_pretrained(model_name_or_path)
        self.torch_dtype = torch_dtype
        self.device = device

    def __call__(self, images: List[Image.Image], texts: List[str]) -> List[float]:
        assert len(images) == len(texts)
        image_inputs = self.processor(images=images, return_tensors="pt")  # {"pixel_values": ...}
        if self.torch_dtype == torch.float16:
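            # Cast pixel values to half precision to match the model weights' dtype.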
            image_inputs["pixel_values"] = image_inputs["pixel_values"].half()
        text_inputs = self.processor(text=texts, return_tensors="pt", padding=True, truncation=True)  # {"input_ids": ..., "attention_mask": ...}
        image_inputs, text_inputs = image_inputs.to(self.device), text_inputs.to(self.device)
        with torch.no_grad():
            image_embs = F.normalize(self.model.get_image_features(**image_inputs))
            text_embs = F.normalize(self.model.get_text_features(**text_inputs))
            scores = text_embs @ image_embs.T  # [N, N]
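            # Each diagonal entry is the cosine similarity of the matched (image_i, text_i) pair.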

        return scores.diagonal().tolist()
    
    def __repr__(self) -> str:
        return "clip_score"


if __name__ == "__main__":
    aesthetic_score = AestheticScore(device="cuda")
    clip_score = CLIPScore(device="cuda")

    paths = ["demo/splash_cl2_midframe.jpg"] * 3
    texts = ["a joker", "a woman", "a man"]
    images = [Image.open(p).convert("RGB") for p in paths]

    print(aesthetic_score(images))
    print(clip_score(images, texts))