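"""Gradio Space: generate sound effects from an image or from text prompts.

Pipeline sketch: BLIP image captioning (transformers) -> AudioLDM2
text-to-audio (diffusers) -> per-prompt WAVs blended with pydub.
"""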
import gradio as gr
import os
import tempfile
import torch
import numpy as np
from scipy.io.wavfile import write
from dotenv import load_dotenv
from diffusers import DiffusionPipeline
from transformers import pipeline
from PIL import Image
from pydub import AudioSegment
from typing import List
import spaces

# Load environment variables
load_dotenv()
HF_TOKEN = os.getenv("HF_TKN")
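# NOTE: the secret is read under the name "HF_TKN" as written above; if your
# Space stores it as the conventional "HF_TOKEN", change the name to match.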

# Device configuration
device = "cuda" if torch.cuda.is_available() else "cpu"
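
# UI sizing limits (assumed values, not fixed by the original design):
# up to MAX_PROMPTS text prompts and MAX_TRACKS individual track slots.
MAX_PROMPTS = 5
MAX_TRACKS = 5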

# Initialize models once at import time. Gradio has no gr.cache() decorator;
# module-level loading already gives load-once behavior.
def load_caption_model():
    return pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-base",
        device=device
    )

def load_audio_model():
    # `use_auth_token` is deprecated in recent diffusers releases;
    # `token` is the current keyword.
    return DiffusionPipeline.from_pretrained(
        "cvssp/audioldm2",
        token=HF_TOKEN
    )

caption_pipe = load_caption_model()
audio_pipe = load_audio_model().to(device)

# @spaces.GPU requests a ZeroGPU slot for up to `duration` seconds per call;
# it is a no-op on non-ZeroGPU hardware.
@spaces.GPU(duration=120)
def analyze_image(image_file):
    """Generate a caption from an image, with validation."""
    try:
        # gr.Image(type="filepath") passes a path string, not raw bytes
        try:
            image = Image.open(image_file)
            image.verify()
            image = Image.open(image_file)  # reopen: verify() invalidates the handle
        except Exception as e:
            raise ValueError(f"Invalid image file: {e}")

        results = caption_pipe(image)
        if not results or not isinstance(results, list):
            raise RuntimeError("No caption generated")

        caption = results[0].get("generated_text", "").strip()
        if not caption:
            raise RuntimeError("Empty caption generated")

        return caption

    except Exception as e:
        raise gr.Error(f"Image processing error: {e}")

@spaces.GPU(duration=120)
def generate_audio(prompt: str, num_steps=100, guidance_scale=7.5):
    """Generate a 10-second audio clip from a single prompt."""
    try:
        if not prompt or len(prompt) < 10:
            raise ValueError("Prompt must be at least 10 characters")

        with torch.inference_mode():
            audio = audio_pipe(
                prompt=prompt,
                num_inference_steps=int(num_steps),
                guidance_scale=guidance_scale,
                audio_length_in_s=10
            ).audios[0]

        # AudioLDM2 returns float32 samples at 16 kHz; convert to 16-bit PCM
        # so pydub (which reads WAVs via the stdlib wave module) can load them.
        pcm = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile:
            write(tmpfile.name, 16000, pcm)
            return tmpfile.name

    except Exception as e:
        raise gr.Error(f"Audio generation error: {e}")

# Mixing is pure CPU work (pydub), so no @spaces.GPU decorator is needed.
def blend_audios(audio_files: List[str]) -> str:
    """Mix multiple audio files into one."""
    try:
        if not audio_files:
            raise ValueError("No audio files to blend")

        # Use the first track as the base and overlay the rest on top
        mixed = AudioSegment.from_wav(audio_files[0])

        for file in audio_files[1:]:
            track = AudioSegment.from_wav(file)
            # overlay() keeps the base track's length, so trim longer tracks
            if len(track) > len(mixed):
                track = track[:len(mixed)]
            mixed = mixed.overlay(track)

        # Export mixed audio
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile:
            mixed.export(tmpfile.name, format="wav")
            return tmpfile.name

    except Exception as e:
        raise gr.Error(f"Audio mixing error: {e}")

def process_inputs(input_choice, image_file, num_steps, guidance_scale, *prompts):
    """Handle both image and text input modes"""
    try:
        # Filter empty prompts (hidden textboxes arrive as empty strings)
        valid_prompts = [p.strip() for p in prompts if p and p.strip()]

        if input_choice == "Image":
            if not image_file:
                raise gr.Error("Please upload an image")
            main_prompt = analyze_image(image_file)
            valid_prompts = [main_prompt] + valid_prompts
        elif not valid_prompts:
            raise gr.Error("Please enter at least one text prompt")

        # Generate audio for each prompt, honoring the advanced settings
        audio_files = [generate_audio(p, num_steps, guidance_scale) for p in valid_prompts]

        # Blend all audio files
        final_audio = blend_audios(audio_files)

        # Return exactly one update per track slot so the output count
        # matches the components wired to generate_btn.click below.
        track_updates = [gr.update(value=f, visible=True) for f in audio_files[:MAX_TRACKS]]
        track_updates += [gr.update(visible=False)] * (MAX_TRACKS - len(track_updates))
        return valid_prompts, final_audio, *track_updates

    except Exception as e:
        raise gr.Error(str(e))

# Gradio interface
css = """
#main-container { max-width: 800px; margin: 0 auto; }
.dark { background: #1a1a1a; }
.prompt-box { margin-bottom: 10px; }
.audio-track { margin: 5px 0; }
"""

with gr.Blocks(css=css, theme=gr.themes.Default(primary_hue="emerald")) as app:
    with gr.Column(elem_id="main-container"):
        gr.Markdown("""
        # 🎨 Image to Sound Generator
        Transform visual content or text prompts into mixed sound effects!
        """)
        
        # Input Mode Selector
        input_choice = gr.Radio(
            choices=["Image", "Text"],
            value="Image",
            label="Input Mode",
            interactive=True
        )
        
        # Image Input Section
        with gr.Row(visible=True) as image_row:
            image_input = gr.Image(type="filepath", label="Upload Image")
        
        # Text Input Section: pre-create all textboxes and reveal them on
        # demand, since Blocks components cannot be created after build time
        with gr.Column(visible=False) as text_inputs_col:
            prompt_components = [
                gr.Textbox(label=f"Sound Effect {i+1}", lines=2, visible=(i < 3))
                for i in range(MAX_PROMPTS)
            ]
            add_prompt_btn = gr.Button("Add Another Prompt", variant="secondary")

        # Dynamic prompt management
        current_prompts = gr.State(value=3)

        def add_prompt(current_count):
            # Reveal the next hidden textbox, capped at MAX_PROMPTS
            new_count = min(current_count + 1, MAX_PROMPTS)
            visibility = [gr.update(visible=(i < new_count)) for i in range(MAX_PROMPTS)]
            return [new_count] + visibility

        add_prompt_btn.click(
            fn=add_prompt,
            inputs=current_prompts,
            outputs=[current_prompts] + prompt_components
        )
        
        # Toggle between image/text inputs
        def toggle_inputs(choice):
            if choice == "Image":
                return [gr.update(visible=True), gr.update(visible=False)]
            return [gr.update(visible=False), gr.update(visible=True)]
        
        input_choice.change(
            fn=toggle_inputs,
            inputs=input_choice,
            outputs=[image_row, text_inputs_col]
        )
        
        # Generation Controls
        with gr.Accordion("Advanced Settings", open=False):
            steps_slider = gr.Slider(10, 200, 100, label="Generation Steps")
            guidance_slider = gr.Slider(1.0, 15.0, 7.5, label="Guidance Scale")
        
        generate_btn = gr.Button("Generate Mixed Sound", variant="primary")
        
        # Outputs
        with gr.Column():
            gr.Markdown("### Generation Results")
            prompt_display = gr.JSON(label="Used Prompts")
            final_audio = gr.Audio(label="Blended Sound Effect", interactive=False)
            
            with gr.Accordion("Individual Tracks", open=False):
                track_components = [gr.Audio(visible=False) for _ in range(MAX_TRACKS)]
        
        # Examples (fill the inputs only; generation runs on demand, since
        # cached example outputs would require GPU time at build)
        gr.Examples(
            examples=[
                ["examples/storm.jpg", "A dramatic thunderstorm", "Heavy rain pouring", "Distant rumble"],
                [None, "Clock ticking", "Crowd murmuring", "Footsteps on concrete"]
            ],
            inputs=[image_input] + prompt_components[:3],
        )

        # Contribution Section
        with gr.Column():
            gr.Markdown("""
            ## 👥 How You Can Contribute
            We welcome contributions! Contact us at [[email protected]](mailto:[email protected]).
            Support us on [Ko-fi](https://ko-fi.com/bilsimaging). - Bilel Aroua
            """)
            gr.HTML("""
            <div style="text-align: center;">
                <a href="https://visitorbadge.io/status?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image">
                    <img src="https://api.visitorbadge.io/api/visitors?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image&countColor=%23263759" />
                </a>
            </div>
            """)

        # Footer
        gr.Markdown("""
        ---
        [GitHub Repository](https://github.com/bilsimaging/Imaginesound)
        """)

    # Event handling
    generate_btn.click(
        fn=process_inputs,
        inputs=[input_choice, image_input, steps_slider, guidance_slider] + prompt_components,
        outputs=[prompt_display, final_audio] + track_components
    )

if __name__ == "__main__":
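    # share=True is ignored on Hugging Face Spaces (the app is already public)
    # but makes a local run reachable through a temporary *.gradio.live link.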
    app.launch(debug=True, share=True)