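# Gradio demo: zero-shot classification of clothing images with
# Marqo/marqo-fashionSigLIP, loaded through open_clip.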
import gradio as gr
import open_clip
import torch
from PIL import Image


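# Load the fashion-tuned SigLIP model, its image transforms, and tokenizer
# from the Hugging Face Hub; preprocess_val is the inference-time transform.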
model, preprocess_train, preprocess_val = open_clip.create_model_and_transforms('hf-hub:Marqo/marqo-fashionSigLIP')
tokenizer = open_clip.get_tokenizer('hf-hub:Marqo/marqo-fashionSigLIP')

def predict(inp: Image.Image) -> str:
    # Gradio hands us a PIL image; apply the validation transform and add a
    # batch dimension.
    image = preprocess_val(inp).unsqueeze(0)

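    # Earlier variant (kept for reference): zero-shot classification over the
    # full product-category list below.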
    # catgs = [
    #   "Shirts",
    #   "SetShirtsPants",
    #   "SetJacketsPants",
    #   "Pants",
    #   "Jeans",
    #   "JacketsCoats",
    #   "Shoes",
    #   "Underpants",
    #   "Socks",
    #   "Hats",
    #   "Wallets",
    #   "Bags",
    #   "Scarfs",
    #   "Parasols&Umbrellas",
    #   "Necklaces",
    #   "Towels&Robes",
    #   "WallObjects",
    #   "Rugs",
    #   "Glassware",
    #   "Mugs&Cups",
    #   "OralCare"
    # ]
    # text = tokenizer(catgs)

    # with torch.no_grad(), torch.cuda.amp.autocast():
    #     image_features = model.encode_image(image)
    #     image_features /= image_features.norm(dim=-1, keepdim=True) 
    #     text_features = model.encode_text(text)
    #     text_features /= text_features.norm(dim=-1, keepdim=True)
    #     text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
    
    # max_prob_idx = np.argmax(text_probs)
    # pred_lbl = catgs[max_prob_idx]
    # pred_lbl_prob = text_probs[0, max_prob_idx].item()

    pred_lbl = "clothing"
    mw = ["men", "women", "boy", "girl"]
    catgs = [
        mw[0] + "s " + pred_lbl,
        mw[1] + "s " + pred_lbl,
        mw[2] + "s " + pred_lbl,
        mw[3] + "s " + pred_lbl
    ]
    text = tokenizer(catgs)
    
    # Autocast only takes effect on GPU; torch.cuda.amp.autocast is deprecated
    # in favour of torch.amp.autocast.
    with torch.no_grad(), torch.amp.autocast("cuda", enabled=torch.cuda.is_available()):
        # Embed the image and the prompts, then L2-normalise so the dot
        # product below is a cosine similarity.
        image_features = model.encode_image(image)
        text_features = model.encode_text(text)
        image_features /= image_features.norm(dim=-1, keepdim=True)
        text_features /= text_features.norm(dim=-1, keepdim=True)

        # Scaled similarities -> probability distribution over the prompts.
        text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
    
    # Report the highest-probability prompt and its confidence.
    max_prob_idx = text_probs.argmax(dim=-1).item()
    pred_lbl_f = mw[max_prob_idx]
    pred_lbl_prob_f = text_probs[0, max_prob_idx].item()
    # tlt = f"{pred_lbl} <{100.0 * pred_lbl_prob:.1f}%> , {pred_lbl_f} <{100.0 * pred_lbl_prob_f:.1f}%>"
    tlt = f"{pred_lbl_f} <{100.0 * pred_lbl_prob_f:.1f}%>"
    return tlt

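# Launch the demo with a public share link; the example images are bundled
# under imgs/.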
gr.Interface(fn=predict,
             inputs=gr.Image(type="pil"),
             outputs=gr.Label(),
             examples=["imgs/cargo.jpg", "imgs/palazzo.jpg",
                      "imgs/leggings.jpg", "imgs/dresspants.jpg"]).launch(share=True)