# Hugging Face Space: fffiloni/Music-To-Image (status: Paused)
# File size: 2,211 Bytes

import gradio as gr

# Remote Space loaded as a callable; infer() calls it with an audio filepath
# and api_name="predict" to obtain music captions.
lpmc_client = gr.load("seungheondoh/LP-Music-Caps-demo", src="spaces")
from gradio_client import Client

# gradio_client connection to a hosted Llama chat Space; infer() sends it
# prompts through client.predict(..., api_name="/chat_1").
client = Client("https://ysharma-explore-llamav2-with-tgi.hf.space/")
# Stable Diffusion v1.5 Space loaded as a callable; infer() calls it with the
# generated image description and fn_index=0.
sd = gr.load("runwayml/stable-diffusion-v1-5", src="spaces")

from pydub import AudioSegment

def cut_audio(input_path, output_path, max_duration=30000):
    """Export an MP3 copy of an audio file, trimmed to a maximum length.

    Args:
        input_path: Path of the audio file to load.
        output_path: Destination path for the exported MP3.
        max_duration: Longest length to keep, compared against ``len()`` of
            the loaded segment (pydub reports segment length in
            milliseconds, so the default keeps the first 30 seconds).

    Returns:
        ``output_path``, unchanged, so the call can be used inline.
    """
    segment = AudioSegment.from_file(input_path)
    # Slice only when the clip exceeds the limit; shorter clips go out whole.
    trimmed = segment[:max_duration] if len(segment) > max_duration else segment
    trimmed.export(output_path, format="mp3")
    return output_path

def infer(audio_file):
    """Turn an uploaded music clip into captions, an image prompt and an image.

    Steps:
      1. Trim the clip with ``cut_audio`` and caption it via ``lpmc_client``.
      2. Ask the chat endpoint (``client``) to summarize the captions.
      3. Ask the same endpoint for one image description of that summary.
      4. Send the image description to the Stable Diffusion Space (``sd``).

    Args:
        audio_file: Filepath of the uploaded audio, or a falsy value
            (``None`` / empty string) when nothing was uploaded.

    Returns:
        A ``(cap_result, result, image)`` tuple: the raw captioning output,
        the image description returned by the chat endpoint, and the first
        element of the Stable Diffusion Space's output.

    Raises:
        gr.Error: If no audio file was provided.
    """
    # Without an upload Gradio passes None; fail early with a readable
    # message instead of an obscure error from inside the audio loader.
    if not audio_file:
        raise gr.Error("Please upload an audio file before generating.")

    truncated_audio = cut_audio(audio_file, "trunc_audio.mp3")

    cap_result = lpmc_client(
        truncated_audio,  # str (filepath or URL to file) in 'audio_path' Audio component
        api_name="predict",
    )
    print(cap_result)

    summarize_q = f"""

    I'll give you a list of music descriptions. Create a summary reflecting the musical ambiance. 
    Do not processs each segment, but provide a summary for the whole instead.
    
    Here's the list:

    {cap_result}
    """

    summary_result = client.predict(
        summarize_q,  # str in 'Message' Textbox component
        api_name="/chat_1",
    )

    print(f"SUMMARY: {summary_result}")

    llama_q = f"""

    I'll give you music description, then i want you to provide an illustrative image description that would fit well with the music.
    Answer with only one image description. Never do lists.

    Here's the music description :

    {summary_result}
    
    """

    result = client.predict(
        llama_q,  # str in 'Message' Textbox component
        api_name="/chat_1",
    )

    print(result)

    # Only the first element of the Space's output is kept as the image.
    image = sd(result, fn_index=0)[0]

    return cap_result, result, image

# UI: one audio upload, a trigger button, two text outputs for the
# intermediate captions, and the generated picture.
with gr.Blocks() as demo:
    with gr.Column(elem_id="col-container"):
        audio_input = gr.Audio(type="filepath", source="upload")
        infer_btn = gr.Button("Generate")
        lpmc_cap = gr.Textbox(label="Lp Music Caps caption")
        llama_trans_cap = gr.Textbox(label="Llama translation")
        # infer() returns a Stable Diffusion image as its third value, so it
        # is displayed in an Image component rather than a Video player.
        img_result = gr.Image(label="Result")

    # Output order must match the tuple returned by infer().
    infer_btn.click(
        fn=infer,
        inputs=[audio_input],
        outputs=[lpmc_cap, llama_trans_cap, img_result],
    )

# Enable request queuing, then start the server.
demo.queue().launch()