fffiloni committed on
Commit
4bc845a
·
1 Parent(s): d18abca

added lyrics optional step

Browse files
Files changed (1) hide show
  1. app.py +54 -33
app.py CHANGED
@@ -7,6 +7,8 @@ lpmc_client = gr.load("seungheondoh/LP-Music-Caps-demo", src="spaces")
7
  from gradio_client import Client
8
 
9
  client = Client("https://fffiloni-test-llama-api.hf.space/", hf_token=hf_token)
 
 
10
 
11
  from compel import Compel, ReturnedEmbeddingsType
12
  from diffusers import DiffusionPipeline
@@ -60,42 +62,58 @@ def solo_xd(prompt):
60
  images = pipe(prompt=prompt).images[0]
61
  return images
62
 
63
- def infer(audio_file):
 
64
 
65
  truncated_audio = cut_audio(audio_file, "trunc_audio.mp3")
66
-
 
67
  cap_result = lpmc_client(
68
  truncated_audio, # str (filepath or URL to file) in 'audio_path' Audio component
69
  api_name="predict"
70
  )
71
- print(cap_result)
72
-
73
- #summarize_q = f"""
74
-
75
- #I'll give you a list of music descriptions. Create a summary reflecting the musical ambiance.
76
- #Do not processs each segment, but provide a summary for the whole instead.
 
 
 
 
 
77
 
78
- #Here's the list:
79
-
80
- #{cap_result}
81
- #"""
82
-
83
- #summary_result = client.predict(
84
- # summarize_q, # str in 'Message' Textbox component
85
- # api_name="/chat_1"
86
- #)
87
-
88
- #print(f"SUMMARY: {summary_result}")
89
-
90
- llama_q = f"""
91
- I'll give you a music description, from i want you to provide an illustrative image description that would fit well with the music.
92
- Do not processs each segment or song, but provide a summary for the whole instead.
93
- Answer with only one image description. Never do lists. Maximum 77 tokens.
94
- Here's the music description :
95
- {cap_result}
96
 
97
- """
98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  result = client.predict(
100
  llama_q, # str in 'Message' Textbox component
101
  api_name="/predict"
@@ -105,8 +123,10 @@ def infer(audio_file):
105
 
106
  print(f"Llama2 result: {result}")
107
 
108
- # ———
109
-
 
 
110
  prompt = result
111
  conditioning, pooled = compel(prompt)
112
  images = pipe(prompt_embeds=conditioning, pooled_prompt_embeds=pooled).images[0]
@@ -142,21 +162,22 @@ with gr.Blocks(css=css) as demo:
142
  </p>
143
  </div>""")
144
  audio_input = gr.Audio(label="Music input", type="filepath", source="upload")
 
145
  infer_btn = gr.Button("Generate Image from Music")
146
  #lpmc_cap = gr.Textbox(label="Lp Music Caps caption")
147
  llama_trans_cap = gr.Textbox(label="Llama translation", visible=False)
148
  img_result = gr.Image(label="Image Result")
149
- tryagain_btn = gr.Button("Try again ?", visible=False)
150
 
151
- gr.Examples(examples=[["./examples/electronic.mp3"],["./examples/folk.wav"], ["./examples/orchestra.wav"]],
152
  fn=infer,
153
- inputs=[audio_input],
154
  outputs=[img_result, llama_trans_cap, tryagain_btn],
155
  cache_examples=True
156
  )
157
 
158
  #infer_btn.click(fn=infer, inputs=[audio_input], outputs=[lpmc_cap, llama_trans_cap, img_result])
159
- infer_btn.click(fn=infer, inputs=[audio_input], outputs=[img_result, llama_trans_cap, tryagain_btn])
160
  tryagain_btn.click(fn=solo_xd, inputs=[llama_trans_cap], outputs=[img_result])
161
 
162
  demo.queue(max_size=20).launch()
 
7
  from gradio_client import Client
8
 
9
  client = Client("https://fffiloni-test-llama-api.hf.space/", hf_token=hf_token)
10
+ lyrics_client = Client("https://fffiloni-music-to-lyrics.hf.space/")
11
+
12
 
13
  from compel import Compel, ReturnedEmbeddingsType
14
  from diffusers import DiffusionPipeline
 
62
  images = pipe(prompt=prompt).images[0]
63
  return images
64
 
65
+ def infer(audio_file, has_lyrics):
66
+ print("NEW INFERENCE ...")
67
 
68
  truncated_audio = cut_audio(audio_file, "trunc_audio.mp3")
69
+
70
+ print("Calling LP Music Caps...")
71
  cap_result = lpmc_client(
72
  truncated_audio, # str (filepath or URL to file) in 'audio_path' Audio component
73
  api_name="predict"
74
  )
75
+ print(f"MUSIC DESC: {cap_result}")
76
+
77
+ if has_lyrics == "Yes" :
78
+ print("""———
79
+ Getting Lyrics ...
80
+ """)
81
+ lyrics_result = lyrics_client.predict(
82
+ audio_file, # str (filepath or URL to file) in 'Song input' Audio component
83
+ fn_index=0
84
+ )
85
+ print(f"LYRICS: {lyrics_result}")
86
 
87
+ llama_q = f"""
88
+ I'll give you a music description + the lyrics of the song.
89
+ Give me an image description that would fit well with the music description, reflecting the lyrics too.
90
+ Be creative, do not do list, just an image description as required. Try to think about human characters first.
91
+ Your image description must fit well for a stable diffusion prompt.
92
+
93
+ Here's the music description :
94
+
95
+ « {cap_result} »
 
 
 
 
 
 
 
 
 
96
 
97
+ And here are the lyrics :
98
 
99
+ « {lyrics_result} »
100
+
101
+ """
102
+ elif has_lyrics == "No" :
103
+
104
+ llama_q = f"""
105
+ I'll give you a music description.
106
+ Give me an image description that would fit well with the music description.
107
+ Be creative, do not do list, just an image description as required. Try to think about human characters first.
108
+ Your image description must fit well for a stable diffusion prompt.
109
+
110
+ Here's the music description :
111
+
112
+ « {cap_result} »
113
+ """
114
+ print("""———
115
+ Calling Llama2 ...
116
+ """)
117
  result = client.predict(
118
  llama_q, # str in 'Message' Textbox component
119
  api_name="/predict"
 
123
 
124
  print(f"Llama2 result: {result}")
125
 
126
+ #Β β€”β€”β€”
127
+ print("""———
128
+ Calling SD-XL ...
129
+ """)
130
  prompt = result
131
  conditioning, pooled = compel(prompt)
132
  images = pipe(prompt_embeds=conditioning, pooled_prompt_embeds=pooled).images[0]
 
162
  </p>
163
  </div>""")
164
  audio_input = gr.Audio(label="Music input", type="filepath", source="upload")
165
+ has_lyrics = gr.Radio(label="Does your audio has lyrics ?", choices=["Yes", "No"], value="No", info="If yes, the image should reflect the lyrics, but be aware that because we add a step (getting lyrics), inference will take more time.")
166
  infer_btn = gr.Button("Generate Image from Music")
167
  #lpmc_cap = gr.Textbox(label="Lp Music Caps caption")
168
  llama_trans_cap = gr.Textbox(label="Llama translation", visible=False)
169
  img_result = gr.Image(label="Image Result")
170
+ tryagain_btn = gr.Button("Try another image ?", visible=False)
171
 
172
+ gr.Examples(examples=[["./examples/electronic.mp3", "No"],["./examples/folk.wav", "No"], ["./examples/orchestra.wav", "No"]],
173
  fn=infer,
174
+ inputs=[audio_input, has_lyrics],
175
  outputs=[img_result, llama_trans_cap, tryagain_btn],
176
  cache_examples=True
177
  )
178
 
179
  #infer_btn.click(fn=infer, inputs=[audio_input], outputs=[lpmc_cap, llama_trans_cap, img_result])
180
+ infer_btn.click(fn=infer, inputs=[audio_input, has_lyrics], outputs=[img_result, llama_trans_cap, tryagain_btn])
181
  tryagain_btn.click(fn=solo_xd, inputs=[llama_trans_cap], outputs=[img_result])
182
 
183
  demo.queue(max_size=20).launch()