"""Gradio demo that turns an uploaded image into generated text.

A ``VisionEncoderDecoderModel`` checkpoint is loaded once at start-up together
with its tokenizer and feature extractor; ``predict`` runs beam-search
generation on a single image and returns the decoded text.
"""

import os

# This flag has to be in the environment before TensorFlow is loaded. When
# TensorFlow is installed, importing parts of ``transformers`` (for example
# ``pipeline``) can load it as a side effect, so the assignment must come
# before the imports below rather than after them.
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

import re  # noqa: E402,F401  (kept: other code may rely on it)
from typing import Optional  # noqa: E402

import gradio as gr  # noqa: E402
import torch  # noqa: E402
import transformers  # noqa: E402,F401  (kept: other code may rely on it)
from PIL import Image  # noqa: E402
from transformers import (  # noqa: E402
    AutoTokenizer,
    ViTFeatureExtractor,
    VisionEncoderDecoderModel,
    pipeline,  # noqa: F401  (kept: other code may rely on it)
)

device = 'cpu'
model_id = "nttdataspain/vit-gpt2-stablediffusion2-lora"

# Generation settings used for every request.
MAX_LENGTH = 16
NUM_BEAMS = 4

model = VisionEncoderDecoderModel.from_pretrained(model_id)
model.to(device)
# Inference only: switch to eval mode once here instead of on every request.
model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_id)
# NOTE(review): recent transformers releases deprecate ViTFeatureExtractor in
# favour of ViTImageProcessor; kept as-is so older installs still work.
feature_extractor = ViTFeatureExtractor.from_pretrained(model_id)


def predict(image: Optional[Image.Image]) -> str:
    """Generate text for a single image.

    Args:
        image: The PIL image handed over by the Gradio ``Image`` input, or
            ``None`` when the user submitted the form without an image.

    Returns:
        The decoded, whitespace-stripped text of the first generated sequence.

    Raises:
        gr.Error: If ``image`` is ``None``; Gradio shows the message in the UI
            instead of an opaque ``AttributeError``.
    """
    if image is None:
        raise gr.Error("Please upload an image first.")

    rgb_image = image.convert('RGB')
    pixel_values = feature_extractor(
        images=[rgb_image], return_tensors="pt"
    ).pixel_values.to(device)

    # No gradients are needed for generation.
    with torch.no_grad():
        output_ids = model.generate(
            pixel_values,
            max_length=MAX_LENGTH,
            num_beams=NUM_BEAMS,
            return_dict_in_generate=True,
        ).sequences

    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    # The batch holds exactly one image, so only the first entry is relevant.
    return decoded[0].strip()


demo = gr.Interface(
    predict,
    title='Image to text',
    inputs=gr.Image(label="Upload any Image", type='pil'),
    outputs='text',
)

if __name__ == "__main__":
    # ``share=True`` asks Gradio for a public share link in addition to the
    # local server. The ``image`` name is kept from the original script and
    # holds whatever ``launch()`` returns.
    image = demo.launch(share=True)