"""Run SegFormer-B3 (Cityscapes checkpoint) semantic segmentation on three images.

The script loads the pretrained preprocessor and model, reads three images
from disk, preprocesses them as one batch, and exposes the raw per-class
logits in the module-level name ``logits``.
"""

import torch
from PIL import Image
from transformers import SegformerFeatureExtractor, SegformerForSemanticSegmentation

# Single source of truth for the checkpoint so the preprocessor and the model
# can never drift apart.
CHECKPOINT = "nvidia/segformer-b3-finetuned-cityscapes-1024-1024"

# NOTE(review): SegformerFeatureExtractor is deprecated in newer transformers
# releases in favour of SegformerImageProcessor — confirm the installed
# version before switching.
feature_extractor = SegformerFeatureExtractor.from_pretrained(CHECKPOINT)
model = SegformerForSemanticSegmentation.from_pretrained(CHECKPOINT)

# Alternative: fetch a sample image over HTTP (requires `import requests`).
# url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# image = Image.open(requests.get(url, stream=True).raw)

# Placeholder paths; replace with real image files.
image_paths = ("image1", "image2", "image3")


def _load_rgb(path):
    """Open the image at *path*, return an RGB copy, and close the file."""
    with Image.open(path) as img:
        # convert() forces the pixel data to be read, so the returned image
        # stays usable after the file handle is closed. It also normalises
        # grayscale / RGBA / palette inputs to three-channel RGB.
        return img.convert("RGB")


image1, image2, image3 = (_load_rgb(path) for path in image_paths)

inputs = feature_extractor(images=[image1, image2, image3], return_tensors="pt")

# Inference only: skip autograd bookkeeping to save memory and time.
with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits  # shape (batch_size, num_labels, height/4, width/4)