Image Classification
timm
drhead committed on
Commit
44425c0
1 Parent(s): 9cb911a

create PILOT2 inference script

Browse files
Files changed (1) hide show
  1. JTP_PILOT2/inference_gradio.py +223 -0
JTP_PILOT2/inference_gradio.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ import gradio as gr
4
+ from PIL import Image
5
+ import safetensors.torch
6
+ import timm
7
+ from timm.models import VisionTransformer
8
+ import torch
9
+ from torchvision.transforms import transforms
10
+ from torchvision.transforms import InterpolationMode
11
+ import torchvision.transforms.functional as TF
12
+
13
+ torch.set_grad_enabled(False)
14
+
15
class Fit(torch.nn.Module):
    """Resize a PIL image so it fits inside a bounding box, keeping aspect ratio.

    Args:
        bounds: Target box as ``(height, width)``, or a single int for a
            square box.
        interpolation: Resampling mode handed to ``TF.resize``.
        grow: When False, an image that is already smaller than the box is
            never upscaled.
        pad: Fill value used to pad the result out to exactly ``bounds``
            (split evenly between opposite sides). ``None`` disables padding.
    """

    def __init__(
        self,
        bounds: tuple[int, int] | int,
        interpolation = InterpolationMode.LANCZOS,
        grow: bool = True,
        pad: float | None = None
    ):
        super().__init__()

        self.bounds = (bounds, bounds) if isinstance(bounds, int) else bounds
        self.interpolation = interpolation
        self.grow = grow
        self.pad = pad

    def forward(self, img: Image.Image) -> Image.Image:
        """Return ``img`` resized (and, if ``pad`` is set, padded) to the box."""
        wimg, himg = img.size  # PIL reports (width, height)
        hbound, wbound = self.bounds

        hscale = hbound / himg
        wscale = wbound / wimg

        if not self.grow:
            hscale = min(hscale, 1.0)
            wscale = min(wscale, 1.0)

        # The tighter axis decides the scale so both edges stay inside the box.
        scale = min(hscale, wscale)

        if scale == 1.0:
            # No resampling needed, but do NOT return yet: the image may
            # still be narrower/shorter than the box and require padding.
            hnew, wnew = himg, wimg
        else:
            # Clamp to [1, bound]: rounding must neither overshoot the box
            # nor collapse an extreme aspect ratio to a zero-sized edge.
            hnew = max(1, min(round(himg * scale), hbound))
            wnew = max(1, min(round(wimg * scale), wbound))
            img = TF.resize(img, (hnew, wnew), self.interpolation)

        if self.pad is None:
            return img

        hpad = hbound - hnew
        wpad = wbound - wnew
        if hpad == 0 and wpad == 0:
            return img

        # Any odd leftover pixel goes to the bottom / right side.
        tpad = hpad // 2
        bpad = hpad - tpad

        lpad = wpad // 2
        rpad = wpad - lpad

        # TF.pad expects (left, top, right, bottom).
        return TF.pad(img, (lpad, tpad, rpad, bpad), self.pad)

    def __repr__(self) -> str:
        return (
            f"{self.__class__.__name__}(" +
            f"bounds={self.bounds}, " +
            f"interpolation={self.interpolation.value}, " +
            f"grow={self.grow}, " +
            f"pad={self.pad})"
        )
72
+
73
+ class CompositeAlpha(torch.nn.Module):
74
+ def __init__(
75
+ self,
76
+ background: tuple[float, float, float] | float,
77
+ ):
78
+ super().__init__()
79
+
80
+ self.background = (background, background, background) if isinstance(background, float) else background
81
+ self.background = torch.tensor(self.background).unsqueeze(1).unsqueeze(2)
82
+
83
+ def forward(self, img: torch.Tensor) -> torch.Tensor:
84
+ if img.shape[-3] == 3:
85
+ return img
86
+
87
+ alpha = img[..., 3, None, :, :]
88
+
89
+ img[..., :3, :, :] *= alpha
90
+
91
+ background = self.background.expand(-1, img.shape[-2], img.shape[-1])
92
+ if background.ndim == 1:
93
+ background = background[:, None, None]
94
+ elif background.ndim == 2:
95
+ background = background[None, :, :]
96
+
97
+ img[..., :3, :, :] += (1.0 - alpha) * background
98
+ return img[..., :3, :, :]
99
+
100
+ def __repr__(self) -> str:
101
+ return (
102
+ f"{self.__class__.__name__}(" +
103
+ f"background={self.background})"
104
+ )
105
+
106
# Preprocessing applied to every input image, in order:
#   1. fit inside a 384x384 box (aspect ratio kept, no padding here),
#   2. convert the PIL image to a tensor,
#   3. composite any alpha channel over mid-gray,
#   4. normalise each channel with mean 0.5 / std 0.5,
#   5. centre-crop to exactly 384x384.
_preprocess_steps = [
    Fit((384, 384)),
    transforms.ToTensor(),
    CompositeAlpha(0.5),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True),
    transforms.CenterCrop((384, 384)),
]
transform = transforms.Compose(_preprocess_steps)

# Architecture only; no pretrained weights are fetched at this point.
model: VisionTransformer = timm.create_model(
    "vit_so400m_patch14_siglip_384.webli",
    pretrained=False,
    num_classes=9083,
)
119
+
120
class GatedHead(torch.nn.Module):
    """Classifier head whose per-class output is a sigmoid score times a sigmoid gate.

    A single linear layer produces ``2 * num_classes`` values; the first half
    feeds the score activation and the second half feeds the gate.

    Args:
        num_features: Width of the incoming feature vector.
        num_classes: Number of output classes.
    """

    def __init__(self, num_features: int, num_classes: int):
        super().__init__()
        self.num_classes = num_classes
        self.linear = torch.nn.Linear(num_features, num_classes * 2)

        self.act = torch.nn.Sigmoid()
        self.gate = torch.nn.Sigmoid()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Project ``x`` and return the elementwise product of score and gate."""
        n = self.num_classes
        projected = self.linear(x)
        scores = self.act(projected[:, :n])
        gates = self.gate(projected[:, n:])
        return scores * gates
136
+
137
# Replace the stock linear classifier with the gated head. The input width is
# read from the existing layer's in_features rather than min(weight.shape),
# which would pick the wrong axis if the class count were ever smaller than
# the embedding width.
model.head = GatedHead(model.head.in_features, 9083)

# Load the fine-tuned weights (path is relative to the working directory).
safetensors.torch.load_model(model, "JTP_PILOT2-2-e3-vit_so400m_patch14_siglip_384.safetensors")

if torch.cuda.is_available():
    model.cuda()
    # Half precision + channels_last only on compute capability >= 7.
    if torch.cuda.get_device_capability()[0] >= 7: # tensor cores
        model.to(dtype=torch.float16, memory_format=torch.channels_last)

model.eval()
147
+
148
# Tag vocabulary. JSON text is UTF-8, so decode it explicitly instead of
# relying on the platform's default encoding.
with open("tagger_tags.json", "r", encoding="utf-8") as file:
    tags = json.load(file) # type: dict

# Display names: underscores become spaces. Position in this list is used as
# the class index when decoding model output.
# NOTE(review): assumes key order in the JSON matches the model's output
# order - confirm against the training code.
allowed_tags = [tag.replace("_", " ") for tag in tags]

# Scores from the most recent classification, highest first; shared by the
# UI callbacks so the threshold can be re-applied without re-running the model.
sorted_tag_score = {}
156
+
157
def run_classifier(image, threshold):
    """Classify an uploaded image and return the tags above ``threshold``.

    The top-scoring tags are cached in the module-level ``sorted_tag_score``
    so the threshold can later be changed without another forward pass.

    Args:
        image: PIL image supplied by the Gradio image component.
        threshold: Minimum score a tag needs to be reported.

    Returns:
        Whatever ``create_tags(threshold)`` returns for the fresh scores.
    """
    global sorted_tag_score

    if image is None:
        # Nothing to classify; reset the cache instead of crashing.
        sorted_tag_score = {}
        return create_tags(threshold)

    # Keep the alpha channel: the preprocessing pipeline composites
    # transparency over its own background, which is impossible once the
    # image has been converted to plain RGB.
    img = image.convert('RGBA')
    tensor = transform(img).unsqueeze(0)

    if torch.cuda.is_available():
        tensor = tensor.cuda()
        if torch.cuda.get_device_capability()[0] >= 7: # tensor cores
            tensor = tensor.to(dtype=torch.float16, memory_format=torch.channels_last)

    with torch.no_grad():
        probits = model(tensor)[0].cpu()
        # Never ask for more entries than the model produces.
        values, indices = probits.topk(min(250, probits.numel()))

    tag_score = {
        allowed_tags[index]: score
        for index, score in zip(indices.tolist(), values.tolist())
    }
    sorted_tag_score = dict(sorted(tag_score.items(), key=lambda item: item[1], reverse=True))

    return create_tags(threshold)
177
+
178
def create_tags(threshold):
    """Filter the cached scores by ``threshold``.

    Reads the module-level ``sorted_tag_score`` and keeps its ordering.

    Returns:
        A pair ``(tag_string, scores)``: the surviving tag names joined by
        ``", "``, and a dict mapping each surviving tag to its score. Only
        scores strictly greater than ``threshold`` survive.
    """
    kept = {}
    for tag, score in sorted_tag_score.items():
        if score > threshold:
            kept[tag] = score

    tag_text = ", ".join(kept)
    return tag_text, kept
183
+
184
def clear_image():
    """Drop the cached scores and return empty values for both output widgets."""
    global sorted_tag_score
    sorted_tag_score = dict()

    empty_text, empty_scores = "", dict()
    return empty_text, empty_scores
188
+
189
# Gradio front end: image + threshold on the left, tag string and per-tag
# scores on the right.
with gr.Blocks(css=".output-class { display: none; }") as demo:
    gr.Markdown("""
    ## Joint Tagger Project: JTP-PILOT² Demo **BETA**
    This tagger is designed for use on furry images (though may very well work on out-of-distribution images, potentially with funny results). A threshold of 0.2 is recommended. Lower thresholds often turn up more valid tags, but can also result in some amount of hallucinated tags.
    This tagger is the result of joint efforts between members of the RedRocket team, with distinctions given to Thessalo for creating the foundation for this project with his efforts, RedHotTensors for redesigning the process into a second-order method that models information expectation, and drhead for dataset prep, creation of training code and supervision of training runs.
    Special thanks to Minotoro at frosting.ai for providing the compute power for this project.
    """)
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(
                label="Source",
                sources=['upload'],
                type='pil',
                height=512,
                show_label=False,
            )
            threshold_slider = gr.Slider(
                minimum=0.00,
                maximum=1.00,
                step=0.01,
                value=0.20,
                label="Threshold",
            )
        with gr.Column():
            tag_string = gr.Textbox(label="Tag String")
            label_box = gr.Label(
                label="Tag Predictions",
                num_top_classes=250,
                show_label=False,
            )

    # A new upload runs the model and fills both outputs.
    image_input.upload(run_classifier, inputs=[image_input, threshold_slider], outputs=[tag_string, label_box])

    # Removing the image empties both outputs.
    image_input.clear(clear_image, inputs=[], outputs=[tag_string, label_box])

    # Moving the slider re-filters the cached scores only.
    threshold_slider.input(create_tags, inputs=[threshold_slider], outputs=[tag_string, label_box])

if __name__ == "__main__":
    demo.launch()