import gradio as gr
# `spaces` is not referenced by name in this part of the file; presumably it is
# imported for its import-time side effects on Hugging Face Spaces -- confirm
# before removing.
import spaces
import os

# Process environment configuration. These assignments are made *before* the
# model package is imported below, so anything that reads them at import time
# sees the intended values.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # enumerate GPUs by PCI bus id
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # expose only the first GPU
# NOTE(review): presumably a dependency looks up the current user name and the
# hosting container does not define USER -- confirm.
os.environ["USER"] = "imagecraft"

# Imported after the environment is configured (see above).
from src.model.modules.imagecraft import ImageCraft  # noqa: E402

# Load the fine-tuned checkpoint once at import time so that every request
# handled by the UI reuses the same model instance.
model = ImageCraft.from_pretrained("nsandiman/imagecraft-ft-co-224")

# Image pre-filled in the upload widget when the page first loads.
default_image = "media/images/3.jpg"
def generate(image_path):
    """Describe an image as speech and text.

    Args:
        image_path: Filesystem path of the image to describe. Gradio passes
            ``None`` when the image widget is empty.

    Returns:
        A ``(audio_buffer, transcript)`` tuple, in the order expected by the
        interface outputs (audio first, then text).

    Raises:
        gr.Error: If no image was supplied.
    """
    # Without this guard an empty widget would forward None to the model and
    # surface an opaque failure; gr.Error is shown to the user as a message.
    if image_path is None:
        raise gr.Error("Please upload an image first.")

    # The model returns (transcript, audio); the outputs are swapped on return
    # to match the Audio-then-Textbox output order of the interface.
    transcript, audio_buffer = model.generate(image_path, output_type="buffer")
    return audio_buffer, transcript
# Single-page UI: one image input, mapped by `generate` to an audio clip and
# its transcript.
imagecraft_app = gr.Interface(
    fn=generate,
    inputs=[
        gr.Image(
            type="filepath",  # hand `generate` a path on disk, not pixel data
            label="Upload an image",
            sources=["upload"],  # file upload is the only enabled source
            value=default_image,
        ),
    ],
    # Order must match the tuple returned by `generate`: audio, then text.
    outputs=[gr.Audio(label="Speech"), gr.Textbox(label="Text")],
    title="ImageCraft",
    description="Upload an image and get the speech responses.",
    allow_flagging="never",
)
# Start the web server only when executed as a script, so importing this
# module (e.g. to reuse `imagecraft_app`) does not launch it.
if __name__ == "__main__":
    imagecraft_app.launch()