Update app.py
app.py CHANGED
@@ -28,6 +28,11 @@ mem_model_path = hf_hub_download(repo_id="PerceptCLIP/PerceptCLIP_Memorability",
 mem_model.load_state_dict(torch.load(mem_model_path, map_location=device))
 mem_model.eval()
 
+# IQA model
+iqa_model = clip_lora_model(output_dim=1).to(device)
+iqa_model_path = hf_hub_download(repo_id="PerceptCLIP/PerceptCLIP_IQA", filename="perceptCLIP_IQA.pth")
+iqa_model.load_state_dict(torch.load(iqa_model_path, map_location=device))
+iqa_model.eval()
 
 # Emotion label mapping
 idx2label = {
@@ -54,7 +59,7 @@ emotion_emoji = {
 }
 
 # Image preprocessing
-def emo_preprocess(image):
+def emo_mem_preprocess(image):
     transform = transforms.Compose([
         transforms.Resize(224),
         transforms.CenterCrop(224),
@@ -63,24 +68,45 @@ def emo_preprocess(image):
     ])
     return transform(image).unsqueeze(0).to(device)
 
+def IQA_preprocess():
+    random.seed(3407)
+    transform = transforms.Compose([
+        transforms.Resize((512,384)),
+        transforms.RandomCrop(size=(224,224)),
+        transforms.ToTensor(),
+        transforms.Normalize(mean=(0.48145466, 0.4578275, 0.40821073),
+                             std=(0.26862954, 0.26130258, 0.27577711))
+    ])
+    return transform
+
 # Inference function
-def
+def predict_percept(image):
     # If the image is passed as a PIL Image
     if isinstance(image, Image.Image):
         img = image.convert("RGB")
     else:
-        img = Image.open(image).convert("RGB")
-
-
+        img = Image.open(image).convert("RGB")
+
+    batch = torch.stack([IQA_preprocess()(img) for _ in range(15)]).to(device)  # Shape: (15, 3, 224, 224)
+    img = emo_mem_preprocess(img)
 
     with torch.no_grad():
+        iqa_score = iqa_model(batch).cpu().numpy()
         mem_score = mem_model(img).item()
         outputs = emotion_model(img)
         predicted = outputs.argmax(1).item()
+
+    iqa_score = np.mean(iqa_score)  # average the predictions over the 15 random crops
+    min_iqa_pred = -100
+    max_iqa_pred = 100
+    max_iqa_score = 0
+    min_iqa_score = 5
+
+    normalized_iqa_score = ((iqa_score - min_iqa_pred) / (max_iqa_pred - min_iqa_pred)) * (max_iqa_score - min_iqa_score) + min_iqa_score
 
     emotion = idx2label[predicted]
     emoji = emotion_emoji.get(emotion, "❓")
-    return f"{emotion} {emoji}", f"{mem_score:.4f}"
+    return f"{emotion} {emoji}", f"{mem_score:.4f}", f"{normalized_iqa_score:.4f}"
 
 
 
@@ -94,9 +120,9 @@ example_images = [
 
 # Create Gradio interface with custom CSS
 iface = gr.Interface(
-    fn=
+    fn=predict_percept,
     inputs=gr.Image(type="pil", label="Upload an Image"),
-    outputs=[gr.Textbox(label="Emotion"), gr.Textbox(label="Memorability Score")],
+    outputs=[gr.Textbox(label="Emotion"), gr.Textbox(label="Memorability Score"), gr.Textbox(label="IQA Score")],
     title="PerceptCLIP",
     description="This is an official demo of PerceptCLIP from the paper: [Don’t Judge Before You CLIP: A Unified Approach for Perceptual Tasks](https://arxiv.org/pdf/2503.13260). For each specific task, we fine-tune CLIP with LoRA and an MLP head. Our models achieve state-of-the-art performance. \nThis demo shows results from three models, each corresponding to a different task - visual emotion analysis, memorability prediction, and image quality assessment.",
     examples=example_images
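
For reference, the IQA path added in this commit can be exercised outside of Gradio. Below is a minimal sketch, assuming an already-constructed iqa_model: the clip_lora_model class and its import path are not visible in this diff, so model loading is shown only as comments mirroring the lines above. It reproduces the 15-random-crop scoring and the score normalization from predict_percept.

import random

import numpy as np
import torch
from PIL import Image
from torchvision import transforms

device = "cuda" if torch.cuda.is_available() else "cpu"

# Model loading as in app.py (requires the Space's clip_lora_model definition):
# from huggingface_hub import hf_hub_download
# iqa_model = clip_lora_model(output_dim=1).to(device)
# iqa_model_path = hf_hub_download(repo_id="PerceptCLIP/PerceptCLIP_IQA", filename="perceptCLIP_IQA.pth")
# iqa_model.load_state_dict(torch.load(iqa_model_path, map_location=device))
# iqa_model.eval()

def iqa_transform():
    # Same transform as IQA_preprocess() above: resize, random 224x224 crop, CLIP normalization.
    random.seed(3407)
    return transforms.Compose([
        transforms.Resize((512, 384)),
        transforms.RandomCrop(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.48145466, 0.4578275, 0.40821073),
                             std=(0.26862954, 0.26130258, 0.27577711)),
    ])

def iqa_score(img: Image.Image, iqa_model: torch.nn.Module) -> float:
    # Score 15 random crops and average the raw predictions, as predict_percept does.
    batch = torch.stack([iqa_transform()(img) for _ in range(15)]).to(device)  # (15, 3, 224, 224)
    with torch.no_grad():
        raw = iqa_model(batch).cpu().numpy()
    raw_mean = float(np.mean(raw))
    # Map the raw prediction range [-100, 100] onto the displayed range [5, 0], as above.
    min_pred, max_pred = -100, 100
    max_score, min_score = 0, 5
    return ((raw_mean - min_pred) / (max_pred - min_pred)) * (max_score - min_score) + min_score

Usage would then be iqa_score(Image.open("example.jpg").convert("RGB"), iqa_model), with "example.jpg" standing in for any input image; the returned value matches what the demo prints in the "IQA Score" textbox.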
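
One detail of the new normalization block worth noting: because max_iqa_score (0) is smaller than min_iqa_score (5), the mapping from raw predictions to displayed scores is decreasing. A quick arithmetic check of the formula, using only the constants from the diff:

def normalize_iqa(raw):
    # Constants as in predict_percept: raw predictions assumed in [-100, 100], displayed scores in [5, 0].
    min_pred, max_pred = -100, 100
    max_score, min_score = 0, 5
    return ((raw - min_pred) / (max_pred - min_pred)) * (max_score - min_score) + min_score

assert normalize_iqa(-100) == 5.0   # lowest raw prediction -> 5
assert normalize_iqa(0) == 2.5      # midpoint -> 2.5
assert normalize_iqa(100) == 0.0    # highest raw prediction -> 0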