import torch
import gradio as gr
from PIL import Image
from huggingface_hub import hf_hub_download
import importlib.util
from torchvision import transforms
import random
import numpy as np

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
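        # Note: bitwise-reproducible GPU results would additionally require
        # torch.backends.cudnn.deterministic = True, at some speed cost.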
        
# Select compute device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Download model code
class_path = hf_hub_download(repo_id="PerceptCLIP/PerceptCLIP_Emotions", filename="modeling.py")
spec = importlib.util.spec_from_file_location("modeling", class_path)
modeling = importlib.util.module_from_spec(spec)
spec.loader.exec_module(modeling)

# module_from_spec does not register "modeling" in sys.modules, so take the
# class from the executed module object instead of importing it by name.
clip_lora_model = modeling.clip_lora_model

# Emotions model
emotion_model = clip_lora_model().to(device)
emotion_model_path = hf_hub_download(repo_id="PerceptCLIP/PerceptCLIP_Emotions", filename="perceptCLIP_Emotions.pth")
emotion_model.load_state_dict(torch.load(emotion_model_path, map_location=device))
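# (Recent PyTorch versions also accept torch.load(..., weights_only=True),
#  which is the safer way to load a plain state dict.)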
emotion_model.eval()

# Memorability model
mem_model = clip_lora_model(output_dim=1).to(device)
mem_model_path = hf_hub_download(repo_id="PerceptCLIP/PerceptCLIP_Memorability", filename="perceptCLIP_Memorability.pth")
mem_model.load_state_dict(torch.load(mem_model_path, map_location=device))
mem_model.eval()

# IQA model
iqa_model = clip_lora_model(output_dim=1).to(device)
iqa_model_path = hf_hub_download(repo_id="PerceptCLIP/PerceptCLIP_IQA", filename="perceptCLIP_IQA.pth")
iqa_model.load_state_dict(torch.load(iqa_model_path, map_location=device))
iqa_model.eval()
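
# All three models share the CLIP+LoRA backbone: memorability and IQA are
# single-output regressors (output_dim=1), while the emotion model's default
# head presumably produces the 8 class logits mapped below.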

# Emotion label mapping
idx2label = {
    0: "amusement",
    1: "awe",
    2: "contentment",
    3: "excitement",
    4: "anger",
    5: "disgust",
    6: "fear",
    7: "sadness"
}

# Emoji mapping
emotion_emoji = {
    "amusement": "πŸ˜‚",
    "awe": "😲",
    "contentment": "😊",
    "excitement": "πŸ˜ƒ",
    "anger": "😠",
    "disgust": "🀒",
    "fear": "😱",
    "sadness": "😞"
}

# Image preprocessing
def emo_mem_preprocess(image):
    transform = transforms.Compose([ 
        transforms.Resize(224),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.4814, 0.4578, 0.4082), std=(0.2686, 0.2613, 0.2758)),
    ])
    return transform(image).unsqueeze(0).to(device)

def IQA_preprocess():
    # Note: RandomCrop draws from torch's RNG, so this random.seed call does not
    # actually fix the crop locations; set_seed(3407) below seeds torch itself.
    random.seed(3407)
    transform = transforms.Compose([
        transforms.Resize((512, 384)),
        transforms.RandomCrop(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.48145466, 0.4578275, 0.40821073),
                             std=(0.26862954, 0.26130258, 0.27577711))
    ])
    return transform
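
# The (512, 384) resize before random 224x224 cropping presumably matches the
# IQA model's training-time preprocessing.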

set_seed(3407)

# Inference function
def predict_percept(image):
    # Gradio passes a PIL image; also accept a file path for direct calls
    if isinstance(image, Image.Image):
        img = image.convert("RGB")
    else:
        img = Image.open(image).convert("RGB")

    # IQA: score 15 random crops of the same image and average them
    # (test-time augmentation). Use the RGB-converted img, not the raw input.
    iqa_transform = IQA_preprocess()
    batch = torch.stack([iqa_transform(img) for _ in range(15)]).to(device)  # Shape: (15, 3, 224, 224)
    img = emo_mem_preprocess(img)

    with torch.no_grad():
        iqa_score = iqa_model(batch).cpu().numpy()
        mem_score = mem_model(img).item()
        outputs = emotion_model(img)
        predicted = outputs.argmax(1).item()
        
    # Average the per-crop scores, then min-max normalize to [0, 1] using fixed
    # bounds (presumably the prediction range observed on the training set)
    iqa_score = np.mean(iqa_score)
    min_iqa_pred = -6.52
    max_iqa_pred = 3.11
    normalized_iqa_score = (iqa_score - min_iqa_pred) / (max_iqa_pred - min_iqa_pred)

    emotion = idx2label[predicted]
    emoji = emotion_emoji.get(emotion, "❓")  
    return f"{emotion} {emoji}", f"{mem_score:.4f}", f"{normalized_iqa_score:.4f}"
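
# Example direct call (hypothetical local path):
#   print(predict_percept("some_image.jpg"))
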
# Example images 
example_images = [
    "https://webneel.com/daily/sites/default/files/images/daily/02-2013/3-motion-blur-speed-photography.jpg",
    "https://img.freepik.com/free-photo/emotive-excited-female-with-dark-skin-crisp-hair-keeps-hands-clenched-fists-exclaims-with-positiveness-as-achieved-success-her-career-opens-mouth-widely-isolated-white-wall_273609-16443.jpg",
    "https://t4.ftcdn.net/jpg/01/18/44/59/360_F_118445958_NtP7tIsD0CBPyG7Uad7Z2KxVWrsfCPjP.jpg", 
    "https://apnapestcontrol.ca/wp-content/uploads/2019/02/9.jpg",  
    "https://images.pexels.com/photos/1107717/pexels-photo-1107717.jpeg?cs=srgb&dl=pexels-fotios-photos-1107717.jpg&fm=jpg",
    "https://cdn.prod.website-files.com/60e4d0d0155e62117f4faef3/61fab92edbb1ccbc7d12c167_Brian-Matiash-Puppy.jpeg",
]

# Create Gradio interface
iface = gr.Interface(
    fn=predict_percept,
    inputs=gr.Image(type="pil", label="Upload an Image"),
    outputs=[gr.Textbox(label="Emotion"), gr.Textbox(label="Memorability Score"), gr.Textbox(label="IQA Score")],
    title="PerceptCLIP",
    description="This is an official demo of PerceptCLIP from the paper [Don’t Judge Before You CLIP: A Unified Approach for Perceptual Tasks](https://arxiv.org/pdf/2503.13260). For each task, we fine-tune CLIP with LoRA and an MLP head; our models achieve state-of-the-art performance. \nThis demo shows results from three models, one per task: visual emotion analysis, memorability prediction, and image quality assessment.",
    examples=example_images
)
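
# Note: iface.launch(share=True) would also create a temporary public link,
# handy when running the demo outside Hugging Face Spaces.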


if __name__ == "__main__":
    iface.launch()