Spaces:
Paused
Paused
File size: 4,303 Bytes
7718032 849fe7e 7718032 849fe7e 7718032 58d8890 450bd2c 9f04bd1 450bd2c b7d6c4c 7718032 0b066f5 4c807d1 0b066f5 7718032 b7d6c4c 7718032 b7d6c4c 6db627d 7718032 1a2c5bd e0dcf02 1a2c5bd cdb7851 1a2c5bd e0dcf02 1a2c5bd e0dcf02 1a2c5bd e0dcf02 1a2c5bd e0dcf02 b898d4b 85e7dfb 58ea2be 52fd1af 0f45386 ab9867e 0f45386 ab9867e 7718032 ab9867e 849fe7e 7718032 89285d1 7718032 0b10fd7 5178b9b 0b10fd7 6bad35a 7718032 4b738f1 0b10fd7 7718032 4b738f1 7718032 4b738f1 3ca7acb 4b738f1 9559379 3ca7acb 0b066f5 9559379 0b066f5 7718032 4b738f1 03bf86f 0b066f5 7718032 b9d57d5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 |
# Module-level setup: everything below runs once at import time and builds the
# three back ends used by infer(): the captioning Space, the Llama API client,
# and the local Stable Diffusion XL pipeline.
import gradio as gr
import os
# Hugging Face token read from the environment; None when HF_TOKEN is unset.
hf_token = os.environ.get('HF_TOKEN')
# Handle on the LP-Music-Caps Space; infer() calls it with an audio file path.
lpmc_client = gr.load("seungheondoh/LP-Music-Caps-demo", src="spaces")
from gradio_client import Client
# Client for the hosted Llama endpoint; hf_token is passed along for access.
client = Client("https://fffiloni-test-llama-api.hf.space/", hf_token=hf_token)
from diffusers import DiffusionPipeline
import torch
# Stable Diffusion XL base pipeline, loaded in float16 from the fp16 safetensors variant.
pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, use_safetensors=True, variant="fp16")
# The pipeline is placed on the CUDA device, so a GPU is required to run this file.
pipe.to("cuda")
# Alternative pipeline options, currently disabled:
#pipe.enable_model_cpu_offload()
# if using torch < 2.0
# pipe.enable_xformers_memory_efficient_attention()
from pydub import AudioSegment
def cut_audio(input_path, output_path, max_duration=30000):
    """Write an MP3 copy of ``input_path`` capped at ``max_duration``.

    Args:
        input_path: Path of the audio file to read.
        output_path: Path the (possibly shortened) MP3 is written to.
        max_duration: Maximum length to keep, in pydub length units
            (milliseconds); the default keeps the first 30 seconds.

    Returns:
        ``output_path``, unchanged, so the call can be used inline.
    """
    clip = AudioSegment.from_file(input_path)
    exceeds_limit = len(clip) > max_duration
    if exceeds_limit:
        # Keep only the leading part of the recording.
        clip = clip[:max_duration]
    # The export always happens, so short inputs are re-encoded to MP3 as well.
    clip.export(output_path, format="mp3")
    return output_path
def solo_xd(prompt):
    """Run the module-level diffusion pipeline on ``prompt``.

    Used by the "Try again ?" button to regenerate an image from the
    description that is already stored in the UI.

    Args:
        prompt: Text description to render.

    Returns:
        The first image produced by the pipeline.
    """
    generated = pipe(prompt=prompt)
    first_image = generated.images[0]
    return first_image
def infer(audio_file):
    """Generate an image that illustrates the given piece of music.

    The audio is truncated with ``cut_audio``, captioned by the LP-Music-Caps
    Space, the caption is turned into an image description by the Llama
    endpoint, and that description is rendered with the diffusion pipeline.

    Args:
        audio_file: Path to the uploaded audio file, or ``None`` when the
            user has not provided one.

    Returns:
        A 3-tuple ``(image, description, update)``: the first generated
        image, the image description returned by the Llama endpoint, and a
        ``gr.update(visible=True)`` that reveals the component wired as the
        third output.

    Raises:
        gr.Error: If ``audio_file`` is empty, so the UI shows a readable
            message instead of an opaque decoding failure.
    """
    if not audio_file:
        # With nothing uploaded Gradio hands the function None; fail early
        # with a user-facing message rather than deep inside pydub.
        raise gr.Error("Please upload an audio file first.")

    # Only the truncated copy is sent to the captioning Space.
    truncated_audio = cut_audio(audio_file, "trunc_audio.mp3")
    cap_result = lpmc_client(
        truncated_audio,  # file path for the Space's 'audio_path' input
        api_name="predict"
    )
    print(cap_result)

    # The prompt text is sent verbatim to the model; keep it flush-left so no
    # extra indentation ends up inside the message.
    llama_q = f"""
I'll give you music description, then i want you to provide an illustrative image description that would fit well with the music.
Answer with only one image description. Never do lists. Do not processs each segment, but provide a summary for the whole instead.
Here's the music description :
{cap_result}
"""
    result = client.predict(
        llama_q,  # str for the endpoint's 'Message' input
        api_name="/predict"
    )
    print(f"Llama2 result: {result}")

    images = pipe(prompt=result).images[0]
    print("Finished")

    return images, result, gr.update(visible=True)
# Page styling: constrain the main column width and center it.
css = """
#col-container {max-width: 510px; margin-left: auto; margin-right: auto;}
"""

# UI definition: one centered column with the header, the audio input, the
# trigger button, and the result widgets.
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML("""<div style="text-align: center; max-width: 700px; margin: 0 auto;">
<div
style="
display: inline-flex;
align-items: center;
gap: 0.8rem;
font-size: 1.75rem;
"
>
<h1 style="font-weight: 900; margin-bottom: 7px; margin-top: 5px;">
Music To Image
</h1>
</div>
<p style="margin-bottom: 10px; font-size: 94%">
Sends an audio into <a href="https://huggingface.co/spaces/seungheondoh/LP-Music-Caps-demo" target="_blank">LP-Music-Caps</a>
to generate a audio caption which is then translated to an illustrative image description with Llama2, and finally run through
Stable Diffusion XL to generate an image from the audio ! <br /><br />
Note: Only the first 30 seconds of your audio will be used for inference.
</p>
</div>""")
        audio_input = gr.Audio(label="Music input", type="filepath", source="upload")
        infer_btn = gr.Button("Generate Image from Music")
        # Hidden textbox: holds the generated description so the retry button
        # can reuse it as its input.
        llama_trans_cap = gr.Textbox(label="Llama translation", visible=False)
        img_result = gr.Image(label="Image Result")
        # Starts hidden; infer() reveals it through its third return value.
        tryagain_btn = gr.Button("Try again ?", visible=False)

    # Full pipeline: audio -> caption -> image description -> image.
    infer_btn.click(fn=infer, inputs=[audio_input], outputs=[img_result, llama_trans_cap, tryagain_btn])
    # Retry: re-render from the stored description without redoing the captioning.
    tryagain_btn.click(fn=solo_xd, inputs=[llama_trans_cap], outputs=[img_result])

# Queue incoming requests (at most 20 waiting) and start the app.
demo.queue(max_size=20).launch()