#7 opened by hualing0222
- README.md +1 -1
- app.py +3 -17
- requirements.txt +1 -1
README.md
CHANGED
@@ -4,7 +4,7 @@ emoji: π
 colorFrom: green
 colorTo: pink
 sdk: gradio
-sdk_version: 5.
+sdk_version: 5.0.1
 app_file: app.py
 pinned: false
 short_description: Generates audio environment from an image
app.py
CHANGED
@@ -184,20 +184,6 @@ def get_ezaudio(prompt):
         raise gr.Error("EzAudio space API is not ready, please try again in few minutes ")
 
 def infer(image_in, chosen_model):
-    """
-    Generate an audio clip (sound effect) from an input image using the selected generative model.
-
-    This function first generates a caption from the provided image using a vision-language model.
-    The caption is then used as a text prompt for various audio generation models.
-
-    Args:
-        image_in (str): File path to the input image. The image will be processed to generate a descriptive caption.
-        chosen_model (str): The name of the audio generation model to use. Supported options include: "AudioLDM-2", "Tango", "Stable Audio Open".
-
-    Returns:
-        str | dict: The path or result object of the generated audio clip, depending on the model used.
-
-    """
     caption = get_caption_from_kosmos(image_in)
     if chosen_model == "MAGNet" :
         magnet_result = get_magnet(caption)
@@ -247,9 +233,9 @@ with gr.Blocks(css=css) as demo:
                     "AudioLDM-2",
                     #"AudioGen",
                     "Tango",
-
+                    "Tango 2",
                     "Stable Audio Open",
-
+                    "EzAudio"
                 ], value="AudioLDM-2")
                 submit_btn = gr.Button("Submit")
             with gr.Column():
@@ -266,4 +252,4 @@ with gr.Blocks(css=css) as demo:
         outputs=[audio_o],
     )
 
-demo.queue(max_size=10).launch(debug=True, show_error=True
+demo.queue(max_size=10).launch(debug=True, show_error=True)
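For context, the hunks above only touch the top of infer() and the Radio choices; the per-model dispatch that consumes the caption is outside the diff. Below is a minimal, non-authoritative sketch of that caption-then-dispatch flow, reusing the helper names that do appear in the hunks (get_caption_from_kosmos, get_magnet, get_ezaudio) and a hypothetical get_tango2 for the new "Tango 2" choice:

import gradio as gr

# Sketch only -- not the Space's actual code. It mirrors the flow implied by
# the diff: caption the image first, then route the caption to the chosen
# audio backend. get_caption_from_kosmos, get_magnet and get_ezaudio appear
# in the hunks above; get_tango2 is a hypothetical placeholder.
def infer(image_in, chosen_model):
    # Step 1: describe the image with a vision-language model.
    caption = get_caption_from_kosmos(image_in)

    # Step 2: turn the caption into audio with the selected backend.
    if chosen_model == "MAGNet":
        return get_magnet(caption)
    elif chosen_model == "Tango 2":
        return get_tango2(caption)   # hypothetical helper for the new choice
    elif chosen_model == "EzAudio":
        return get_ezaudio(caption)
    raise gr.Error(f"Unsupported model: {chosen_model}")

The real infer presumably also covers "AudioLDM-2", "Tango" and "Stable Audio Open"; only the branches relevant to this PR are sketched here.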
requirements.txt
CHANGED
@@ -1 +1 @@
-moviepy
+moviepy
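As a closing note on the "EzAudio space API is not ready" error path visible in the app.py context above: calling another Space from app.py is commonly done with gradio_client, with failures converted to gr.Error so they surface in the UI. A rough sketch under that assumption (the Space id "user/EzAudio" and the api_name are placeholders, not values taken from this PR):

import gradio as gr
from gradio_client import Client

# Sketch only: the generic pattern for calling a remote Space and reporting
# an unavailable API back to the Gradio UI.
def get_ezaudio(prompt):
    try:
        client = Client("user/EzAudio")  # remote Space id (placeholder)
        return client.predict(prompt, api_name="/predict")
    except Exception:
        raise gr.Error("EzAudio space API is not ready, please try again in few minutes ")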