suric commited on
Commit
061e8b0
·
1 Parent(s): a99ae87

adjust captioning

Browse files
app.py CHANGED
@@ -145,8 +145,8 @@ def show_caption(show_caption_condition, description, prompt):
145
  )
146
 
147
 
148
- def post_submit(show_caption, image_input):
149
- _, description, prompt = generate_caption(image_input)
150
  return (
151
  gr.Textbox(
152
  label="Image Caption",
@@ -349,16 +349,6 @@ def UI():
349
  generate = gr.Button(
350
  "Generate Music", interactive=True, visible=False
351
  )
352
- submit.click(
353
- fn=post_submit,
354
- inputs=[show_prompt, image_input],
355
- outputs=[description, prompt, generate],
356
- )
357
- show_prompt.change(
358
- fn=show_caption,
359
- inputs=[show_prompt, description, prompt],
360
- outputs=[description, prompt, generate],
361
- )
362
 
363
  with gr.Column():
364
  with gr.Row():
@@ -391,6 +381,16 @@ def UI():
391
  )
392
  transcribe_button = gr.Button("Transcribe")
393
  d = gr.DownloadButton("Download the file", visible=False)
 
 
 
 
 
 
 
 
 
 
394
  transcribe_button.click(transcribe, inputs=[output_audio], outputs=d)
395
  generate.click(
396
  fn=predict,
 
145
  )
146
 
147
 
148
+ def post_submit(show_caption, model_path, image_input):
149
+ _, description, prompt = generate_caption(image_input, model_path)
150
  return (
151
  gr.Textbox(
152
  label="Image Caption",
 
349
  generate = gr.Button(
350
  "Generate Music", interactive=True, visible=False
351
  )
 
 
 
 
 
 
 
 
 
 
352
 
353
  with gr.Column():
354
  with gr.Row():
 
381
  )
382
  transcribe_button = gr.Button("Transcribe")
383
  d = gr.DownloadButton("Download the file", visible=False)
384
+ submit.click(
385
+ fn=post_submit,
386
+ inputs=[show_prompt, image_input, model_path],
387
+ outputs=[description, prompt, generate],
388
+ )
389
+ show_prompt.change(
390
+ fn=show_caption,
391
+ inputs=[show_prompt, description, prompt],
392
+ outputs=[description, prompt, generate],
393
+ )
394
  transcribe_button.click(transcribe, inputs=[output_audio], outputs=d)
395
  generate.click(
396
  fn=predict,
gradio_components/image.py CHANGED
@@ -22,15 +22,25 @@ Try to make the prompt simple and concise with only 1-2 sentences
22
 
23
  Make sure the ouput is in JSON fomat, with two items `description` and `prompt`"""
24
 
 
 
 
 
 
 
25
 
26
- def generate_caption(image_file, progress=gr.Progress()):
 
 
 
 
27
  with open(image_file, "rb") as f:
28
  image_encoded = base64.b64encode(f.read()).decode("utf-8")
29
  progress(0, desc="Starting image captioning...")
30
  message = client.messages.create(
31
  model="claude-3-opus-20240229",
32
  max_tokens=1024,
33
- system=SYSTEM_PROMPT,
34
  messages=[
35
  {
36
  "role": "user",
 
22
 
23
  Make sure the ouput is in JSON fomat, with two items `description` and `prompt`"""
24
 
25
+ SYSTEM_PROMPT_AUDIO = """You are an expert llm prompt engineer, you understand the structure of llms and facebook musicgen text to audio model. You will be provided with an image, and require to output a prompt for the musicgen model to capture the essense of the image. Try to do it step by step, evaluate and analyze the image thoroughly. After that, develop a prompt that contains the detail of what background sounds this image should have. This prompt will be provided to audiogen model to generate a 15s audio clip.
26
+ Try to make the prompt simple and concise with only 1-2 sentences
27
+
28
+ Make sure the ouput is in JSON fomat, with two items `description` and `prompt`
29
+ """
30
+
31
 
32
+ def generate_caption(image_file, model_file, progress=gr.Progress()):
33
+ if model_file == "facebook/audiogen-medium":
34
+ system_prompt = SYSTEM_PROMPT_AUDIO
35
+ else:
36
+ system_prompt = SYSTEM_PROMPT
37
  with open(image_file, "rb") as f:
38
  image_encoded = base64.b64encode(f.read()).decode("utf-8")
39
  progress(0, desc="Starting image captioning...")
40
  message = client.messages.create(
41
  model="claude-3-opus-20240229",
42
  max_tokens=1024,
43
+ system=system_prompt,
44
  messages=[
45
  {
46
  "role": "user",
gradio_components/prediction.py CHANGED
@@ -21,6 +21,7 @@ def load_model(version="facebook/musicgen-melody"):
21
 
22
 
23
  def _do_predictions(
 
24
  model,
25
  texts,
26
  melodies,
@@ -65,8 +66,16 @@ def _do_predictions(
65
  return_tokens=False,
66
  )
67
  else:
68
- # text only
69
- outputs = model.generate(texts, progress=progress, return_tokens=False)
 
 
 
 
 
 
 
 
70
  except RuntimeError as e:
71
  raise gr.Error("Error while generating " + e.args[0])
72
  outputs = outputs.detach().cpu().float()
@@ -132,6 +141,7 @@ def predict(
132
  model.set_custom_progress_callback(_progress)
133
 
134
  wavs = _do_predictions(
 
135
  model,
136
  [text],
137
  [melody],
 
21
 
22
 
23
  def _do_predictions(
24
+ model_file,
25
  model,
26
  texts,
27
  melodies,
 
66
  return_tokens=False,
67
  )
68
  else:
69
+ if model_file == "facebook/audiogen-medium":
70
+ # audio condition
71
+ outputs = model.generate(
72
+ texts,
73
+ progress=progress
74
+ )
75
+ else:
76
+ # text only
77
+ outputs = model.generate(texts, progress=progress)
78
+
79
  except RuntimeError as e:
80
  raise gr.Error("Error while generating " + e.args[0])
81
  outputs = outputs.detach().cpu().float()
 
141
  model.set_custom_progress_callback(_progress)
142
 
143
  wavs = _do_predictions(
144
+ model_path,
145
  model,
146
  [text],
147
  [melody],