import time

import gradio as gr
import torch
import torch.nn.functional as F
from PIL import Image
from transformers import ViTFeatureExtractor, ViTForImageClassification

# Checkpoint used for both the preprocessing config and the classifier weights.
MODEL_NAME = 'google/vit-base-patch16-224'

# The original script pauses for two seconds on every prediction; the delay is
# kept so behaviour is unchanged.
# NOTE(review): this looks like a demo artefact -- confirm whether it is still
# wanted, and set it to 0 if not.
SIMULATED_LATENCY_SECONDS = 2

# Loaded once at import time so every request reuses the same objects.
# NOTE(review): ViTFeatureExtractor is deprecated in recent transformers
# releases in favour of ViTImageProcessor -- confirm the installed version
# before switching.
feature_extractor = ViTFeatureExtractor.from_pretrained(MODEL_NAME)
model = ViTForImageClassification.from_pretrained(MODEL_NAME)


def predict(image: Image.Image) -> dict:
    """Classify *image* and return the top label with its probability.

    Args:
        image: The picture to classify; the Gradio input component below is
            configured with ``type="pil"``.

    Returns:
        A single-entry dict ``{label: probability}`` where ``label`` is the
        text before the first comma of the model's ``id2label`` entry for the
        highest-scoring class and ``probability`` is that class's softmax
        score as a Python float. This is the shape consumed by the Gradio
        ``"label"`` output.
    """
    inputs = feature_extractor(images=image, return_tensors="pt")

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        logits = model(**inputs).logits

    # One image in, so row 0 holds the only set of class scores.
    probabilities = F.softmax(logits, dim=-1)[0]
    predicted_class_idx = int(probabilities.argmax(-1).item())
    predicted_class_prob = probabilities[predicted_class_idx].item()

    # id2label entries may list several comma-separated synonyms; keep the first.
    label = model.config.id2label[predicted_class_idx].split(",")[0]

    time.sleep(SIMULATED_LATENCY_SECONDS)
    return {label: float(predicted_class_prob)}


demo = gr.Interface(predict, gr.Image(type="pil"), "label")

if __name__ == "__main__":
    # Only start the web server when run as a script, not when imported.
    demo.launch()