Spaces:

Bils
/

Generate-Sound-Effects-from-Image

Runtime error

App Files Files Community

Bils committed on Jan 29

Commit

a4f881b

verified ·

1 Parent(s): 95e77bd

Update app.py

Browse files

Files changed (1) hide show

app.py +113 -80

app.py CHANGED Viewed

@@ -7,13 +7,15 @@ import torch
 from scipy.io.wavfile import write
 from diffusers import DiffusionPipeline
 from transformers import pipeline
-from pathlib import Path
 load_dotenv()
 hf_token = os.getenv("HF_TKN")
 device_id = 0 if torch.cuda.is_available() else -1
 captioning_pipeline = pipeline(
     "image-to-text",
     model="nlpconnect/vit-gpt2-image-captioning",
@@ -26,120 +28,151 @@ pipe = DiffusionPipeline.from_pretrained(
 )
 @spaces.GPU(duration=120)
-def analyze_image_with_free_model(image_file):
     try:
-        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
-            temp_file.write(image_file)
-            temp_image_path = temp_file.name
-        results = captioning_pipeline(temp_image_path)
         if not results or not isinstance(results, list):
             return "Error: Could not generate caption.", True
         caption = results[0].get("generated_text", "").strip()
-        if not caption:
-            return "No caption was generated.", True
-        return caption, False
     except Exception as e:
         return f"Error analyzing image: {e}", True
 @spaces.GPU(duration=120)
-def get_audioldm_from_caption(caption):
     try:
         pipe.to("cuda")
         audio_output = pipe(
-            prompt=caption,
             num_inference_steps=50,
             guidance_scale=7.5
         )
         pipe.to("cpu")
-        audio = audio_output.audios[0]
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
-            write(temp_wav.name, 16000, audio)
-            return temp_wav.name
     except Exception as e:
-        print(f"Error generating audio from caption: {e}")
         return None
 css = """
-#col-container{
-    margin: 0 auto;
-    max-width: 800px;
-    }
 """
 with gr.Blocks(css=css) as demo:
     with gr.Column(elem_id="col-container"):
         gr.HTML("""
-    <h1 style="text-align: center;">🎶 Generate Sound Effects from Image</h1>
-    <p style="text-align: center;">
-        ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
-    </p>
         """)
-    gr.Markdown("""
-    Welcome to this unique sound effect generator! This tool allows you to upload an image and generate a
-    descriptive caption and a corresponding sound effect, all using free, open-source models on Hugging Face.
-    **💡 How it works:**
-    1. **Upload an image**: Choose an image that you'd like to analyze.
-    2. **Generate Description**: Click on 'Generate Description' to get a textual description of your uploaded image.
-    3. **Generate Sound Effect**: Based on the image description, click on 'Generate Sound Effect' to create a
-       sound effect that matches the image context.
-    Enjoy the journey from visual to auditory sensation with just a few clicks!
-    """)
-    image_upload = gr.File(label="Upload Image", type="binary")
-    generate_description_button = gr.Button("Generate Description")
-    caption_display = gr.Textbox(label="Image Description", interactive=False)
-    generate_sound_button = gr.Button("Generate Sound Effect")
-    audio_output = gr.Audio(label="Generated Sound Effect")
-    gr.Markdown("""
-    ## 👥 How You Can Contribute
-    We welcome contributions and suggestions for improvements. Your feedback is invaluable
-    to the continuous enhancement of this application.
-    For support, questions, or to contribute, please contact us at
-    [[email protected]](mailto:[email protected]).
-    Support our work and get involved by donating through
-    [Ko-fi](https://ko-fi.com/bilsimaging). - Bilel Aroua
-    """)
-    gr.Markdown("""
-    ## 📢 Stay Connected
-    This app is a testament to the creative possibilities that emerge when technology meets art.
-    Enjoy exploring the auditory landscape of your images!
-    """)
-    def update_caption(image_file):
-        description, _ = analyze_image_with_free_model(image_file)
-        return description
-    def generate_sound(description):
-        if not description or description.startswith("Error"):
-            return None
-        audio_path = get_audioldm_from_caption(description)
-        return audio_path
-    generate_description_button.click(
-        fn=update_caption,
         inputs=image_upload,
         outputs=caption_display
     )
-    generate_sound_button.click(
-        fn=generate_sound,
-        inputs=caption_display,
         outputs=audio_output
     )
-    gr.HTML('<a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image"><img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image&countColor=%23263759" /></a>')
-    html = gr.HTML()
-demo.launch(debug=True, share=True)

 from scipy.io.wavfile import write
 from diffusers import DiffusionPipeline
 from transformers import pipeline
+from pydub import AudioSegment
+import numpy as np
 load_dotenv()
 hf_token = os.getenv("HF_TKN")
 device_id = 0 if torch.cuda.is_available() else -1
+# Initialize models
 captioning_pipeline = pipeline(
     "image-to-text",
     model="nlpconnect/vit-gpt2-image-captioning",
 )
 @spaces.GPU(duration=120)
+def analyze_image(image_file):
+    """Caption an image with the module-level ``captioning_pipeline``.
+
+    Args:
+        image_file: Passed straight to ``captioning_pipeline``; the UI's
+            image component is configured with ``type="filepath"``.
+
+    Returns:
+        A ``(text, is_error)`` tuple. ``text`` is the stripped caption,
+        "No caption generated." when the caption is empty, or an error
+        message. ``is_error`` is True in every case except a non-empty
+        caption.
+    """
     try:
+        results = captioning_pipeline(image_file)
         if not results or not isinstance(results, list):
             return "Error: Could not generate caption.", True
         caption = results[0].get("generated_text", "").strip()
+        # Builds a 2-tuple: (caption or fallback text, True when caption is empty).
+        return caption if caption else "No caption generated.", not bool(caption)
     except Exception as e:
         return f"Error analyzing image: {e}", True
 @spaces.GPU(duration=120)
+def generate_audio(prompt):
+    """Generate one audio clip for a text prompt with the module-level ``pipe``.
+
+    Args:
+        prompt: Text description of the sound to synthesise.
+
+    Returns:
+        The first entry of the pipeline output's ``audios``, or None when
+        generation raises an exception (the error is printed).
+    """
     try:
         pipe.to("cuda")
         audio_output = pipe(
+            prompt=prompt,
             num_inference_steps=50,
             guidance_scale=7.5
         )
+        return audio_output.audios[0]
+    except Exception as e:
+        print(f"Error generating audio: {e}")
+        return None
+    finally:
+        # Always hand the model back to the CPU, even when generation fails,
+        # so a failed request does not leave it parked on the GPU.
+        pipe.to("cpu")
+def blend_audios(audio_list):
+    """Mix audio clips into one peak-normalised WAV file written at 16 kHz.
+
+    Clips are summed sample by sample; shorter clips contribute silence
+    after their last sample, so the result is as long as the longest clip.
+
+    Args:
+        audio_list: Sequence of NumPy arrays (assumed 1-D mono; confirm
+            against the pipeline output). ``None`` entries, such as the
+            result of a failed ``generate_audio`` call, are skipped.
+
+    Returns:
+        Path of a temporary ``.wav`` file, or None when there is nothing
+        to mix or blending fails (the error is printed).
+    """
+    try:
+        if audio_list is None:
+            audio_list = []
+        # Drop failed generations instead of letting one None abort the mix.
+        clips = [
+            np.asarray(arr, dtype=np.float32)
+            for arr in audio_list
+            if arr is not None
+        ]
+        if not clips:
+            print("Error blending audio: no audio clips to blend")
+            return None
+        # Find the longest audio duration
+        max_length = max(arr.shape[0] for arr in clips)
+        # Mix all audios; float32 keeps the output a 32-bit float WAV
+        # instead of the 64-bit one a default np.zeros buffer would produce.
+        mixed = np.zeros(max_length, dtype=np.float32)
+        for arr in clips:
+            mixed[:arr.shape[0]] += arr
+        # Normalize the audio. An all-silent mix has peak 0, and dividing
+        # by it would fill the file with NaNs, so leave silence untouched.
+        peak = np.max(np.abs(mixed))
+        if peak > 0:
+            mixed = mixed / peak
+        # Save to temporary file. mkstemp returns an already-open OS-level
+        # handle; close it so it is not leaked on every request.
+        fd, tmp_path = tempfile.mkstemp(suffix=".wav")
+        os.close(fd)
+        write(tmp_path, 16000, mixed)
+        return tmp_path
     except Exception as e:
+        print(f"Error blending audio: {e}")
         return None
 css = """
+#col-container { max-width: 800px; margin: 0 auto; }
+.toggle-row { margin: 1rem 0; }
+.prompt-box { margin-bottom: 0.5rem; }
 """
 with gr.Blocks(css=css) as demo:
     with gr.Column(elem_id="col-container"):
         gr.HTML("""
+        <h1 style="text-align: center;">🎶 Advanced Sound Generator</h1>
+        <p style="text-align: center;">⚡ Powered by Bilsimaging</p>
+        """)
+        # Input mode toggle
+        input_mode = gr.Radio(
+            choices=["Image Input", "Text Prompts"],
+            value="Image Input",
+            label="Select Input Mode",
+            elem_classes="toggle-row"
+        )
+        # Image input section
+        with gr.Column(visible=True) as image_col:
+            image_upload = gr.Image(type="filepath", label="Upload Image")
+            generate_desc_btn = gr.Button("Generate Description from Image")
+            caption_display = gr.Textbox(label="Generated Description", interactive=False)
+        # Text input section
+        with gr.Column(visible=False) as text_col:
+            with gr.Row():
+                prompt1 = gr.Textbox(label="Sound Prompt 1", lines=2)
+                prompt2 = gr.Textbox(label="Sound Prompt 2", lines=2)
+            additional_prompts = gr.Column()
+            add_prompt_btn = gr.Button("➕ Add Another Prompt", variant="secondary")
+            generate_sound_btn = gr.Button("Generate Blended Sound", variant="primary")
+        # Audio output
+        audio_output = gr.Audio(label="Final Sound Composition", interactive=False)
+        # Documentation section
+        gr.Markdown("""
+        ## 🎚️ How to Use
+        1. **Choose Input Mode** above
+        2. For images: Upload + Generate Description → Generate Sound
+        3. For text: Enter multiple sound prompts → Generate Blended Sound
+        [Support on Ko-fi](https://ko-fi.com/bilsimaging)
         """)
+        # Visitor badge. The path is this Space's URL-encoded address, as in
+        # the previous version of the app, not a template placeholder.
+        gr.HTML("""
+        <div style="text-align: center; margin-top: 2rem;">
+            <a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image">
+                <img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image&countColor=%23263759"/>
+            </a>
+        </div>
+        """)
+    # Toggle visibility based on input mode
+    def toggle_input(mode):
+        """Show the column matching the selected mode and hide the other."""
+        show_image = mode == "Image Input"
+        return [gr.update(visible=show_image), gr.update(visible=not show_image)]
+    input_mode.change(
+        fn=toggle_input,
+        inputs=input_mode,
+        outputs=[image_col, text_col]
+    )
+    # Image processing chain
+    def describe_image(image_path):
+        """Return only the caption text for the description textbox.
+
+        analyze_image returns a (text, is_error) tuple; wiring it directly
+        to a single Textbox would display the whole tuple.
+        """
+        text, _is_error = analyze_image(image_path)
+        return text
+    generate_desc_btn.click(
+        fn=describe_image,
         inputs=image_upload,
         outputs=caption_display
+    ).then(
+        fn=lambda: gr.update(interactive=True),
+        outputs=generate_sound_btn
     )
+    # Text processing chain
+    def generate_blended_sound(*prompts):
+        """Generate one clip per non-empty prompt and blend them.
+
+        Done in a single handler because chained ``.then()`` steps only
+        receive the components listed in ``inputs``, not the previous
+        step's return value.
+
+        Returns:
+            Path of the blended WAV file, or None when no prompt was
+            entered or no clip could be generated.
+        """
+        cleaned = [p.strip() for p in prompts if p and p.strip()]
+        if not cleaned:
+            return None
+        clips = [generate_audio(p) for p in cleaned]
+        # generate_audio returns None on failure; blend whatever succeeded.
+        clips = [clip for clip in clips if clip is not None]
+        if not clips:
+            return None
+        return blend_audios(clips)
+    generate_sound_btn.click(
+        fn=generate_blended_sound,
+        inputs=[prompt1, prompt2],
         outputs=audio_output
     )
+# Queue management
+# Gradio 4+ rejects the old `concurrency_count` argument at startup and
+# exposes `default_concurrency_limit` instead (the two are not exactly
+# equivalent: the new one is a per-event default; confirm the intended limit).
+try:
+    demo.queue(default_concurrency_limit=2)
+except TypeError:
+    # Older Gradio releases do not know `default_concurrency_limit`.
+    demo.queue(concurrency_count=2)
+if __name__ == "__main__":
+    demo.launch()