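"""Gradio Space: generate sound effects from an image or from text prompts.

Pipeline sketch: BLIP image captioning (transformers) -> AudioLDM2
text-to-audio (diffusers) -> per-prompt WAVs blended with pydub.
"""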
import gradio as gr
import os
import tempfile
import torch
import numpy as np
from scipy.io.wavfile import write
from dotenv import load_dotenv
from diffusers import DiffusionPipeline
from transformers import pipeline
from PIL import Image
from pydub import AudioSegment
from typing import List
import spaces

# Load environment variables
load_dotenv()
HF_TOKEN = os.getenv("HF_TKN")
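# NOTE: the secret is read under the name "HF_TKN" as written above; if your
# Space stores it as the conventional "HF_TOKEN", change the name to match.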

# Device configuration
device = "cuda" if torch.cuda.is_available() else "cpu"
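
# UI sizing limits (assumed values, not fixed by the original design):
# up to MAX_PROMPTS text prompts and MAX_TRACKS individual track slots.
MAX_PROMPTS = 5
MAX_TRACKS = 5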

# Initialize models once at import time. Gradio has no gr.cache() decorator;
# module-level loading already gives load-once behavior.
def load_caption_model():
    return pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-base",
        device=device
    )

def load_audio_model():
    # `use_auth_token` is deprecated in recent diffusers releases;
    # `token` is the current keyword.
    return DiffusionPipeline.from_pretrained(
        "cvssp/audioldm2",
        token=HF_TOKEN
    )

caption_pipe = load_caption_model()
audio_pipe = load_audio_model().to(device)

# @spaces.GPU requests a ZeroGPU slot for up to `duration` seconds per call;
# it is a no-op on non-ZeroGPU hardware.
@spaces.GPU(duration=120)
def analyze_image(image_file):
    """Generate a caption from an image, with validation."""
    try:
        # gr.Image(type="filepath") passes a path string, not raw bytes
        try:
            image = Image.open(image_file)
            image.verify()
            image = Image.open(image_file)  # reopen: verify() invalidates the handle
        except Exception as e:
            raise ValueError(f"Invalid image file: {e}")

        results = caption_pipe(image)
        if not results or not isinstance(results, list):
            raise RuntimeError("No caption generated")

        caption = results[0].get("generated_text", "").strip()
        if not caption:
            raise RuntimeError("Empty caption generated")

        return caption

    except Exception as e:
        raise gr.Error(f"Image processing error: {e}")

@spaces.GPU(duration=120)
def generate_audio(prompt: str, num_steps=100, guidance_scale=7.5):
    """Generate a 10-second audio clip from a single prompt."""
    try:
        if not prompt or len(prompt) < 10:
            raise ValueError("Prompt must be at least 10 characters")

        with torch.inference_mode():
            audio = audio_pipe(
                prompt=prompt,
                num_inference_steps=int(num_steps),
                guidance_scale=guidance_scale,
                audio_length_in_s=10
            ).audios[0]

        # AudioLDM2 returns float32 samples at 16 kHz; convert to 16-bit PCM
        # so pydub (which reads WAVs via the stdlib wave module) can load them.
        pcm = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile:
            write(tmpfile.name, 16000, pcm)
            return tmpfile.name

    except Exception as e:
        raise gr.Error(f"Audio generation error: {e}")

# Mixing is pure CPU work (pydub), so no @spaces.GPU decorator is needed.
def blend_audios(audio_files: List[str]) -> str:
    """Mix multiple audio files into one."""
    try:
        if not audio_files:
            raise ValueError("No audio files to blend")

        # Use the first track as the base and overlay the rest on top
        mixed = AudioSegment.from_wav(audio_files[0])

        for file in audio_files[1:]:
            track = AudioSegment.from_wav(file)
            # overlay() keeps the base track's length, so trim longer tracks
            if len(track) > len(mixed):
                track = track[:len(mixed)]
            mixed = mixed.overlay(track)

        # Export mixed audio
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile:
            mixed.export(tmpfile.name, format="wav")
            return tmpfile.name

    except Exception as e:
        raise gr.Error(f"Audio mixing error: {e}")

def process_inputs(input_choice, image_file, num_steps, guidance_scale, *prompts):
    """Handle both image and text input modes"""
    try:
        # Filter empty prompts (hidden textboxes arrive as empty strings)
        valid_prompts = [p.strip() for p in prompts if p and p.strip()]

        if input_choice == "Image":
            if not image_file:
                raise gr.Error("Please upload an image")
            main_prompt = analyze_image(image_file)
            valid_prompts = [main_prompt] + valid_prompts
        elif not valid_prompts:
            raise gr.Error("Please enter at least one text prompt")

        # Generate audio for each prompt, honoring the advanced settings
        audio_files = [generate_audio(p, num_steps, guidance_scale) for p in valid_prompts]

        # Blend all audio files
        final_audio = blend_audios(audio_files)

        # Return exactly one update per track slot so the output count
        # matches the components wired to generate_btn.click below.
        track_updates = [gr.update(value=f, visible=True) for f in audio_files[:MAX_TRACKS]]
        track_updates += [gr.update(visible=False)] * (MAX_TRACKS - len(track_updates))
        return valid_prompts, final_audio, *track_updates

    except Exception as e:
        raise gr.Error(str(e))

# Gradio interface
css = """
#main-container { max-width: 800px; margin: 0 auto; }
.dark { background: #1a1a1a; }
.prompt-box { margin-bottom: 10px; }
.audio-track { margin: 5px 0; }
"""

with gr.Blocks(css=css, theme=gr.themes.Default(primary_hue="emerald")) as app:
    with gr.Column(elem_id="main-container"):
        gr.Markdown("""
        # 🎨 Image to Sound Generator
        Transform visual content or text prompts into mixed sound effects!
        """)
        
        # Input Mode Selector
        input_choice = gr.Radio(
            choices=["Image", "Text"],
            value="Image",
            label="Input Mode",
            interactive=True
        )
        
        # Image Input Section
        with gr.Row(visible=True) as image_row:
            image_input = gr.Image(type="filepath", label="Upload Image")
        
        # Text Input Section: pre-create all textboxes and reveal them on
        # demand, since Blocks components cannot be created after build time
        with gr.Column(visible=False) as text_inputs_col:
            prompt_components = [
                gr.Textbox(label=f"Sound Effect {i+1}", lines=2, visible=(i < 3))
                for i in range(MAX_PROMPTS)
            ]
            add_prompt_btn = gr.Button("Add Another Prompt", variant="secondary")

        # Dynamic prompt management
        current_prompts = gr.State(value=3)

        def add_prompt(current_count):
            # Reveal the next hidden textbox, capped at MAX_PROMPTS
            new_count = min(current_count + 1, MAX_PROMPTS)
            visibility = [gr.update(visible=(i < new_count)) for i in range(MAX_PROMPTS)]
            return [new_count] + visibility

        add_prompt_btn.click(
            fn=add_prompt,
            inputs=current_prompts,
            outputs=[current_prompts] + prompt_components
        )
        
        # Toggle between image/text inputs
        def toggle_inputs(choice):
            if choice == "Image":
                return [gr.update(visible=True), gr.update(visible=False)]
            return [gr.update(visible=False), gr.update(visible=True)]
        
        input_choice.change(
            fn=toggle_inputs,
            inputs=input_choice,
            outputs=[image_row, text_inputs_col]
        )
        
        # Generation Controls
        with gr.Accordion("Advanced Settings", open=False):
            steps_slider = gr.Slider(10, 200, 100, label="Generation Steps")
            guidance_slider = gr.Slider(1.0, 15.0, 7.5, label="Guidance Scale")
        
        generate_btn = gr.Button("Generate Mixed Sound", variant="primary")
        
        # Outputs
        with gr.Column():
            gr.Markdown("### Generation Results")
            prompt_display = gr.JSON(label="Used Prompts")
            final_audio = gr.Audio(label="Blended Sound Effect", interactive=False)
            
            with gr.Accordion("Individual Tracks", open=False):
                track_components = [gr.Audio(visible=False) for _ in range(MAX_TRACKS)]
        
        # Examples (fill the inputs only; generation runs on demand, since
        # cached example outputs would require GPU time at build)
        gr.Examples(
            examples=[
                ["examples/storm.jpg", "A dramatic thunderstorm", "Heavy rain pouring", "Distant rumble"],
                [None, "Clock ticking", "Crowd murmuring", "Footsteps on concrete"]
            ],
            inputs=[image_input] + prompt_components[:3],
        )

        # Contribution Section
        with gr.Column():
            gr.Markdown("""
            ## 👥 How You Can Contribute
            We welcome contributions! Contact us at [[email protected]](mailto:[email protected]).
            Support us on [Ko-fi](https://ko-fi.com/bilsimaging). - Bilel Aroua
            """)
            gr.HTML("""
            <div style="text-align: center;">
                <a href="https://visitorbadge.io/status?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image">
                    <img src="https://api.visitorbadge.io/api/visitors?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image&countColor=%23263759" />
                </a>
            </div>
            """)

        # Footer
        gr.Markdown("""
        ---
        [GitHub Repository](https://github.com/bilsimaging/Imaginesound)
        """)

    # Event handling
    generate_btn.click(
        fn=process_inputs,
        inputs=[input_choice, image_input, steps_slider, guidance_slider] + prompt_components,
        outputs=[prompt_display, final_audio] + track_components
    )

if __name__ == "__main__":
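    # share=True is ignored on Hugging Face Spaces (the app is already public)
    # but makes a local run reachable through a temporary *.gradio.live link.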
    app.launch(debug=True, share=True)