"""Gradio demo: caption an uploaded image, then read the caption aloud.

Two Hugging Face ``transformers`` pipelines are chained: an image-to-text
model produces a caption, and a text-to-speech model turns that caption
into audio. Both results are shown in a ``gr.Interface``.
"""

import os

import gradio as gr
from IPython.display import Audio as IPythonAudio
from transformers import pipeline

# Build both pipelines once at import time so every request reuses them
# instead of constructing them again per call.
pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
tts_pipe = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")


def launch(input):
    """Caption an image and synthesize speech for that caption.

    Args:
        input: The image supplied by the Gradio ``Image`` component, which
            is configured below with ``type='pil'``. The parameter name is
            kept as-is for backward compatibility even though it shadows
            the ``input`` builtin.

    Returns:
        A ``(caption, (sampling_rate, audio))`` tuple matching the
        interface's ``["text", "audio"]`` outputs.

    Raises:
        gr.Error: If no image was provided.
    """
    if input is None:
        # Submitting the form without an image passes None; report that
        # clearly instead of failing inside the captioning pipeline.
        raise gr.Error("Please provide an image to caption.")

    out = pipe(input)
    # Bind the caption once; it is both returned and fed to the TTS model.
    caption = out[0]['generated_text']

    out_tts = tts_pipe(caption)
    # NOTE(review): the [0] index presumably selects the single waveform
    # from a leading batch/channel axis - confirm against the installed
    # transformers version.
    return caption, (out_tts["sampling_rate"], out_tts["audio"][0])


iface = gr.Interface(
    launch,
    inputs=gr.Image(type='pil'),
    outputs=["text", "audio"],
)

iface.launch()