File size: 2,211 Bytes
7718032
 
 
 
 
 
5178b9b
7718032
b7d6c4c
 
 
 
 
 
 
 
 
 
 
7718032
 
b7d6c4c
 
7718032
 
b7d6c4c
6db627d
7718032
 
 
e0dcf02
 
cdb7851
1b3bbfe
cdb7851
e0dcf02
 
 
 
 
 
 
 
 
 
 
 
0f45386
 
85e7dfb
5178b9b
0f45386
 
 
e0dcf02
0f45386
 
 
7718032
0f45386
f946a20
7718032
 
 
5178b9b
 
7718032
5178b9b
7718032
 
 
 
 
 
 
 
 
5178b9b
7718032
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import gradio as gr

# NOTE: the three remote handles below are created at import time, so merely
# importing this module presumably needs network access to the hosted demos.

# Music-captioning demo loaded from the "seungheondoh/LP-Music-Caps-demo"
# Space; infer() calls it with an audio file path and api_name="predict".
lpmc_client = gr.load("seungheondoh/LP-Music-Caps-demo", src="spaces")
from gradio_client import Client

# Remote chat endpoint (presumably a Llama 2 chat model, judging by the URL);
# infer() queries it through its "/chat_1" API.
client = Client("https://ysharma-explore-llamav2-with-tgi.hf.space/")
# Text-to-image demo loaded from the "runwayml/stable-diffusion-v1-5" Space;
# infer() calls it with a text prompt and fn_index=0.
sd = gr.load("runwayml/stable-diffusion-v1-5", src="spaces")

from pydub import AudioSegment

def cut_audio(input_path, output_path, max_duration=30000):
    """Write an MP3 copy of *input_path*, capped at *max_duration*.

    Args:
        input_path: Path of the audio file to read with pydub.
        output_path: Destination path; always written in MP3 format, whether
            or not the clip had to be shortened.
        max_duration: Maximum clip length, compared against ``len()`` of the
            loaded segment (pydub measures segment length in milliseconds).

    Returns:
        ``output_path``, unchanged, for convenient chaining.
    """
    clip = AudioSegment.from_file(input_path)

    # Only slice when the clip actually exceeds the cap; shorter clips are
    # re-exported as they are.
    exceeds_cap = len(clip) > max_duration
    capped = clip[:max_duration] if exceeds_cap else clip

    capped.export(output_path, format="mp3")
    return output_path

def infer(audio_file):
    """Turn an uploaded music clip into a caption, an image prompt and an image.

    Pipeline:
      1. Truncate the clip with ``cut_audio`` and caption it via ``lpmc_client``.
      2. Ask the chat endpoint (``client``) to summarize the captions.
      3. Ask the chat endpoint for a single image description matching the
         summary.
      4. Send that description to ``sd`` and keep the first returned item.

    Args:
        audio_file: Path of the uploaded audio file, as provided by the
            ``gr.Audio(type="filepath")`` input. ``None``/empty when nothing
            was uploaded.

    Returns:
        A ``(cap_result, result, image)`` tuple: the raw captioning output,
        the image description produced by the chat endpoint, and the first
        element of the ``sd`` output.

    Raises:
        gr.Error: If no audio file was provided.
    """
    # Function-scope imports keep this fix self-contained; both are stdlib.
    import os
    import tempfile

    if not audio_file:
        # Fail with a readable message instead of an opaque error from the
        # audio loader when the user clicks "Generate" without uploading.
        raise gr.Error("Please upload an audio file first.")

    # Each request gets its own scratch directory: with the queue enabled,
    # overlapping requests writing to one shared "trunc_audio.mp3" could
    # overwrite each other's clip. The directory (and the truncated file) is
    # removed as soon as the captioning call has returned.
    with tempfile.TemporaryDirectory() as work_dir:
        truncated_audio = cut_audio(
            audio_file, os.path.join(work_dir, "trunc_audio.mp3")
        )

        cap_result = lpmc_client(
            truncated_audio,  # str (filepath or URL to file) in 'audio_path' Audio component
            api_name="predict",
        )
    print(cap_result)

    summarize_q = f"""

    I'll give you a list of music descriptions. Create a summary reflecting the musical ambiance. 
    Do not processs each segment, but provide a summary for the whole instead.
    
    Here's the list:

    {cap_result}
    """

    summary_result = client.predict(
        summarize_q,  # str in 'Message' Textbox component
        api_name="/chat_1",
    )

    print(f"SUMMARY: {summary_result}")

    llama_q = f"""

    I'll give you music description, then i want you to provide an illustrative image description that would fit well with the music.
    Answer with only one image description. Never do lists.

    Here's the music description :

    {summary_result}
    
    """

    result = client.predict(
        llama_q,  # str in 'Message' Textbox component
        api_name="/chat_1",
    )

    print(result)

    # Only the first item of the text-to-image output is used.
    image = sd(result, fn_index=0)[0]

    return cap_result, result, image

# UI layout: one column with the audio upload, a trigger button, and the three
# outputs produced by infer() (caption text, image description, image).
with gr.Blocks() as demo:
    with gr.Column(elem_id="col-container"):
        audio_input = gr.Audio(type="filepath", source="upload")
        infer_btn = gr.Button("Generate")
        lpmc_cap = gr.Textbox(label="Lp Music Caps caption")
        llama_trans_cap = gr.Textbox(label="Llama translation")
        # infer() returns the text-to-image result as its third value, so the
        # matching output component is an image, not a video player.
        img_result = gr.Image(label="Result")

    # The order of `outputs` must match the tuple returned by infer().
    infer_btn.click(
        fn=infer,
        inputs=[audio_input],
        outputs=[lpmc_cap, llama_trans_cap, img_result],
    )

# Enable request queuing, then start the app.
demo.queue().launch()