import gradio as gr

# All model inference happens inside the pic2song module; the heavy
# dependencies (transformers, torch, scipy, OpenAI, ...) are presumably
# imported there, so this UI layer only needs gradio itself.
import pic2song


def greet(image, image_ins):
    """Generate four music clips (plus their videos) from the input image.

    pic2song.pic2song must return eight values in the order
    (audio1, video1, audio2, video2, audio3, video3, audio4, video4)
    to match the outputs wired into btn.click below.
    """
    return pic2song.pic2song(image, image_ins)


with gr.Blocks() as demo:
    with gr.Column():
        with gr.Row():
            inp = gr.Image(type="pil", image_mode="RGB", height="500px")
            with gr.Column():
                image_ins = gr.Slider(
                    minimum=1,
                    maximum=60,
                    step=1,
                    value=5,
                    label="Generation length (seconds)",
                )
                with gr.Row():
                    out1 = gr.Audio()
                    out2 = gr.Audio()
                with gr.Row():
                    out3 = gr.Audio()
                    out4 = gr.Audio()
                btn = gr.Button("Run")

    with gr.Column():
        with gr.Row():
            video_out1 = gr.Video(interactive=False, height="300px", show_download_button=True)
            video_out2 = gr.Video(interactive=False, height="300px", show_download_button=True)
            video_out3 = gr.Video(interactive=False, height="300px", show_download_button=True)
            video_out4 = gr.Video(interactive=False, height="300px", show_download_button=True)

    btn.click(
        fn=greet,
        inputs=[inp, image_ins],
        outputs=[
            out1, video_out1,
            out2, video_out2,
            out3, video_out3,
            out4, video_out4,
        ],
    )

demo.launch()
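
# ---------------------------------------------------------------------------
# Reference only (never called by the app above): a minimal sketch of what the
# `pic2song.pic2song` helper might look like, inferred from the libraries this
# script originally imported (ViT image captioning, MusicGen, scipy WAV
# output). It is NOT the project's actual implementation: the checkpoints
# ("nlpconnect/vit-gpt2-image-captioning", "facebook/musicgen-small") and the
# single-clip return value are assumptions; the real helper returns four
# (audio, video) pairs and may also use the OpenAI API to turn the caption
# into a richer music prompt.
# ---------------------------------------------------------------------------
def pic2song_sketch(image, seconds):
    """Caption an image, then turn the caption into a short music clip."""
    import uuid

    from scipy.io import wavfile
    from transformers import (
        AutoProcessor,
        AutoTokenizer,
        MusicgenForConditionalGeneration,
        ViTImageProcessor,
        VisionEncoderDecoderModel,
    )

    # 1. Image -> caption (checkpoint choice is an assumption).
    cap_id = "nlpconnect/vit-gpt2-image-captioning"
    cap_model = VisionEncoderDecoderModel.from_pretrained(cap_id)
    pixel_values = ViTImageProcessor.from_pretrained(cap_id)(
        images=[image], return_tensors="pt"
    ).pixel_values
    output_ids = cap_model.generate(pixel_values, max_length=32)
    caption = AutoTokenizer.from_pretrained(cap_id).decode(
        output_ids[0], skip_special_tokens=True
    ).strip()

    # 2. Caption -> music. MusicGen produces ~50 audio tokens per second,
    #    so max_new_tokens scales with the requested clip length.
    processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
    music_model = MusicgenForConditionalGeneration.from_pretrained(
        "facebook/musicgen-small"
    )
    inputs = processor(text=[caption], padding=True, return_tensors="pt")
    audio_values = music_model.generate(**inputs, max_new_tokens=int(seconds * 50))

    # 3. Save as a uniquely named WAV file that gr.Audio can play directly.
    sampling_rate = music_model.config.audio_encoder.sampling_rate
    out_path = f"{uuid.uuid4().hex}.wav"
    wavfile.write(out_path, rate=sampling_rate, data=audio_values[0, 0].cpu().numpy())
    return out_path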