Bils committed
Commit 2f15cbe · verified · 1 Parent(s): 8b471a0

Update app.py

Files changed (1)
  1. app.py +181 -104
app.py CHANGED
@@ -7,139 +7,216 @@ import torch
  from scipy.io.wavfile import write
  from diffusers import DiffusionPipeline
  from transformers import pipeline
- from pathlib import Path

  load_dotenv()
  hf_token = os.getenv("HF_TKN")

- device_id = 0 if torch.cuda.is_available() else -1
-
- captioning_pipeline = pipeline(
-     "image-to-text",
-     model="nlpconnect/vit-gpt2-image-captioning",
-     device=device_id
- )
-
- pipe = DiffusionPipeline.from_pretrained(
-     "cvssp/audioldm2",
-     use_auth_token=hf_token
- )

  @spaces.GPU(duration=120)
- def analyze_image_with_free_model(image_file):
-     try:
-         with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
-             temp_file.write(image_file)
-             temp_image_path = temp_file.name
-
-         results = captioning_pipeline(temp_image_path)
-         if not results or not isinstance(results, list):
-             return "Error: Could not generate caption.", True
-
-         caption = results[0].get("generated_text", "").strip()
-         if not caption:
-             return "No caption was generated.", True
-         return caption, False
      except Exception as e:
-         return f"Error analyzing image: {e}", True

  @spaces.GPU(duration=120)
- def get_audioldm_from_caption(caption):
      try:
-         pipe.to("cuda")
-         audio_output = pipe(
-             prompt=caption,
              num_inference_steps=50,
              guidance_scale=7.5
-         )
-         pipe.to("cpu")
-         audio = audio_output.audios[0]
-
-         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
-             write(temp_wav.name, 16000, audio)
-             return temp_wav.name
      except Exception as e:
-         print(f"Error generating audio from caption: {e}")
          return None

  css = """
- #col-container{
-     margin: 0 auto;
-     max-width: 800px;
- }
  """

  with gr.Blocks(css=css) as demo:
      with gr.Column(elem_id="col-container"):
          gr.HTML("""
-             <h1 style="text-align: center;">🎶 Generate Sound Effects from Image</h1>
-             <p style="text-align: center;">
-                 ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
-             </p>
          """)

-         gr.Markdown("""
-         Welcome to this unique sound effect generator! This tool allows you to upload an image and generate a
-         descriptive caption and a corresponding sound effect, all using free, open-source models on Hugging Face.
-
-         **💡 How it works:**
-         1. **Upload an image**: Choose an image that you'd like to analyze.
-         2. **Generate Description**: Click on 'Generate Description' to get a textual description of your uploaded image.
-         3. **Generate Sound Effect**: Based on the image description, click on 'Generate Sound Effect' to create a
-            sound effect that matches the image context.
-
-         Enjoy the journey from visual to auditory sensation with just a few clicks!
-         """)
-
-         image_upload = gr.File(label="Upload Image", type="binary")
-         generate_description_button = gr.Button("Generate Description")
-         caption_display = gr.Textbox(label="Image Description", interactive=False)
-         generate_sound_button = gr.Button("Generate Sound Effect")
-         audio_output = gr.Audio(label="Generated Sound Effect")
-
-         gr.Markdown("""
-         ## 👥 How You Can Contribute
-         We welcome contributions and suggestions for improvements. Your feedback is invaluable
-         to the continuous enhancement of this application.
-
-         For support, questions, or to contribute, please contact us at
-
-         Support our work and get involved by donating through
-         [Ko-fi](https://ko-fi.com/bilsimaging). - Bilel Aroua
-         """)
-
-         gr.Markdown("""
-         ## 📢 Stay Connected
-         This app is a testament to the creative possibilities that emerge when technology meets art.
-         Enjoy exploring the auditory landscape of your images!
-         """)
-
-         def update_caption(image_file):
-             description, _ = analyze_image_with_free_model(image_file)
-             return description
-
-         def generate_sound(description):
-             if not description or description.startswith("Error"):
-                 return None
-             audio_path = get_audioldm_from_caption(description)
-             return audio_path

-         generate_description_button.click(
-             fn=update_caption,
              inputs=image_upload,
-             outputs=caption_display
          )

-         generate_sound_button.click(
-             fn=generate_sound,
-             inputs=caption_display,
-             outputs=audio_output
          )
-
-         gr.HTML('<a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image"><img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image&countColor=%23263759" /></a>')
-         html = gr.HTML()

- demo.launch(debug=True, share=True)
 
 
  from scipy.io.wavfile import write
  from diffusers import DiffusionPipeline
  from transformers import pipeline
+ from pydub import AudioSegment
+ import numpy as np

+ # Load environment variables
  load_dotenv()
  hf_token = os.getenv("HF_TKN")

+ # Device configuration
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ torch_dtype = torch.float16 if device == "cuda" else torch.float32

+ # Initialize models with automatic device detection
  @spaces.GPU(duration=120)
+ def load_models():
+     global captioning_pipeline, pipe
+     captioning_pipeline = pipeline(
+         "image-to-text",
+         model="nlpconnect/vit-gpt2-image-captioning",
+         device=0 if torch.cuda.is_available() else -1
+     )
+     pipe = DiffusionPipeline.from_pretrained(
+         "cvssp/audioldm2",
+         use_auth_token=hf_token,
+         torch_dtype=torch_dtype
+     ).to(device)

+ load_models()
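
For local experimentation outside Spaces, a memoized loader is a common alternative to rebinding globals on each call. A minimal sketch under that assumption (`get_pipelines` is illustrative and not part of this commit):

```python
import os
from functools import lru_cache

import torch
from diffusers import DiffusionPipeline
from transformers import pipeline


@lru_cache(maxsize=1)
def get_pipelines():
    """Load the captioner and AudioLDM2 once; repeat calls reuse the cache."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = torch.float16 if device == "cuda" else torch.float32
    captioner = pipeline(
        "image-to-text",
        model="nlpconnect/vit-gpt2-image-captioning",
        device=0 if device == "cuda" else -1,
    )
    audio_pipe = DiffusionPipeline.from_pretrained(
        "cvssp/audioldm2",
        use_auth_token=os.getenv("HF_TKN"),
        torch_dtype=dtype,
    ).to(device)
    return captioner, audio_pipe
```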

+ @spaces.GPU(duration=60)
+ def analyze_image(image_file):
+     """Generate caption from image with error handling"""
+     try:
+         results = captioning_pipeline(image_file)
+         if results and isinstance(results, list):
+             return results[0].get("generated_text", "").strip()
+         return "Could not generate caption"
      except Exception as e:
+         return f"Error: {str(e)}"

  @spaces.GPU(duration=120)
+ def generate_audio(prompt):
+     """Generate audio from text prompt"""
      try:
+         return pipe(
+             prompt=prompt,
              num_inference_steps=50,
              guidance_scale=7.5
+         ).audios[0]
+     except Exception as e:
+         print(f"Audio generation error: {str(e)}")
+         return None
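
A usage sketch for the underlying pipeline call (assumes `load_models()` has run; `audio_length_in_s` and `negative_prompt` are standard AudioLDM2 pipeline arguments, and the prompt values here are purely illustrative):

```python
# Illustrative only: set a duration and steer away from artifacts.
wave = pipe(
    prompt="rain falling on a tin roof",
    negative_prompt="low quality, muffled",  # assumption: supported by AudioLDM2
    audio_length_in_s=5.0,
    num_inference_steps=50,
    guidance_scale=7.5,
).audios[0]
# AudioLDM2 returns a 1-D float array sampled at 16 kHz, which matches the
# write(..., 16000, ...) calls elsewhere in this file.
```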

+ def blend_audios(audio_list):
+     """Mix multiple audio arrays into one"""
+     try:
+         valid_audios = [arr for arr in audio_list if arr is not None]
+         if not valid_audios:
+             return None
+
+         max_length = max(arr.shape[0] for arr in valid_audios)
+         mixed = np.zeros(max_length)
+
+         for arr in valid_audios:
+             if arr.shape[0] < max_length:
+                 padded = np.pad(arr, (0, max_length - arr.shape[0]))
+             else:
+                 padded = arr[:max_length]
+             mixed += padded
+
+         mixed = mixed / np.max(np.abs(mixed))
+         _, tmp_path = tempfile.mkstemp(suffix=".wav")
+         write(tmp_path, 16000, mixed)
+         return tmp_path
      except Exception as e:
+         print(f"Blending error: {str(e)}")
          return None
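
A quick usage sketch for `blend_audios` with synthetic tones (numpy only; the values are illustrative):

```python
import numpy as np

sr = 16000
t = np.linspace(0, 1.0, sr, endpoint=False)
tone_a = 0.5 * np.sin(2 * np.pi * 440 * t)             # 1.0 s at 440 Hz
tone_b = 0.5 * np.sin(2 * np.pi * 660 * t[: sr // 2])  # 0.5 s at 660 Hz

# Shorter tracks are zero-padded to the longest, summed, and peak-normalized.
wav_path = blend_audios([tone_a, None, tone_b])
print(wav_path)  # temporary .wav path, or None on failure
```

Two design caveats worth noting: the peak normalization divides by `np.max(np.abs(mixed))`, which is zero for all-silent inputs, and `tempfile.mkstemp` returns an open file descriptor that is never closed here; guarding the division and calling `os.close(fd)` would harden this.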

  css = """
+ #col-container { max-width: 800px; margin: 0 auto; }
+ .toggle-row { margin: 1rem 0; }
+ .prompt-box { margin-bottom: 0.5rem; }
+ .danger { color: #ff4444; font-weight: bold; }
  """

  with gr.Blocks(css=css) as demo:
      with gr.Column(elem_id="col-container"):
+         # Header Section
          gr.HTML("""
+             <h1 style="text-align: center;">🎶 Generate Sound Effects from Image or Text</h1>
+             <p style="text-align: center;">
+                 ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
+             </p>
          """)

+         # Input Mode Toggle
+         input_mode = gr.Radio(
+             choices=["Image Input", "Text Input"],
+             value="Image Input",
+             label="Select Input Mode",
+             elem_classes="toggle-row"
+         )

+         # Image Input Section
+         with gr.Column(visible=True) as image_col:
+             image_upload = gr.Image(type="filepath", label="Upload Image")
+             generate_desc_btn = gr.Button("Generate Description from Image", variant="primary")
+             caption_display = gr.Textbox(label="Generated Description", interactive=False)
+
+         # Text Input Section
+         with gr.Column(visible=False) as text_col:
+             with gr.Row():
+                 prompt1 = gr.Textbox(label="Sound Prompt 1", lines=2, placeholder="Enter sound description...")
+                 prompt2 = gr.Textbox(label="Sound Prompt 2", lines=2, placeholder="Enter sound description...")
+             additional_prompts = gr.Column()
+             add_prompt_btn = gr.Button("➕ Add Another Prompt", variant="secondary")
+             gr.Markdown("<div class='danger'>Max 5 prompts for stability</div>")
+
+         # Generation Controls
+         generate_sound_btn = gr.Button("Generate Sound Effect", variant="primary")
+         audio_output = gr.Audio(label="Generated Sound Effect", interactive=False)
+
+         # Documentation Section
+         gr.Markdown("""
+             ## 👥 How You Can Contribute
+             We welcome contributions! Contact us at [[email protected]](mailto:[email protected]).
+             Support us on [Ko-fi](https://ko-fi.com/bilsimaging) - Bilel Aroua
+         """)
+
+         # Visitor Badge
+         gr.HTML("""
+             <div style="text-align: center;">
+                 <a href="https://visitorbadge.io/status?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image">
+                     <img src="https://api.visitorbadge.io/api/visitors?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image&countColor=%23263759"/>
+                 </a>
+             </div>
+         """)

+         # Input Mode Toggle Handler
+         input_mode.change(
+             lambda mode: (gr.update(visible=mode == "Image Input"), gr.update(visible=mode == "Text Input")),
+             inputs=input_mode,
+             outputs=[image_col, text_col],
+             concurrency_limit=1
+         )
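
The show/hide pattern above can be exercised in isolation, without loading any models; a minimal self-contained sketch (component names are illustrative):

```python
import gradio as gr

with gr.Blocks() as toggle_demo:
    mode = gr.Radio(["Image Input", "Text Input"], value="Image Input", label="Mode")
    with gr.Column(visible=True) as image_col:
        gr.Markdown("Image controls render here.")
    with gr.Column(visible=False) as text_col:
        gr.Markdown("Text prompts render here.")

    # Exactly one column is visible at a time, driven by the radio value.
    mode.change(
        lambda m: (gr.update(visible=m == "Image Input"),
                   gr.update(visible=m == "Text Input")),
        inputs=mode,
        outputs=[image_col, text_col],
    )

if __name__ == "__main__":
    toggle_demo.launch()
```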
+
+         # Image Description Generation
+         generate_desc_btn.click(
+             analyze_image,
              inputs=image_upload,
+             outputs=caption_display,
+             concurrency_limit=2
+         )
+
+         # Dynamic Prompt Addition
+         def add_prompt(current_count):
+             if current_count >= 5:
+                 return current_count, gr.update()
+             new_count = current_count + 1
+             new_prompt = gr.Textbox(
+                 label=f"Sound Prompt {new_count}",
+                 lines=2,
+                 visible=True,
+                 placeholder="Enter sound description..."
+             )
+             return new_count, new_prompt
+
+         prompt_count = gr.State(2)
+         add_prompt_btn.click(
+             add_prompt,
+             inputs=prompt_count,
+             outputs=[prompt_count, additional_prompts],
+             concurrency_limit=1
          )
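
Returning a freshly constructed `gr.Textbox` into a `gr.Column` output, as `add_prompt` does above, behaves differently across Gradio releases. A more portable sketch pre-creates the maximum number of boxes hidden and reveals them one at a time (names here are illustrative, not from this commit):

```python
import gradio as gr

MAX_PROMPTS = 5

with gr.Blocks() as prompts_demo:
    # First two visible, the rest hidden until requested.
    boxes = [
        gr.Textbox(label=f"Sound Prompt {i + 1}", lines=2, visible=i < 2)
        for i in range(MAX_PROMPTS)
    ]
    count = gr.State(2)
    add_btn = gr.Button("➕ Add Another Prompt")

    def reveal_next(n):
        n = min(n + 1, MAX_PROMPTS)
        return [n] + [gr.update(visible=i < n) for i in range(MAX_PROMPTS)]

    add_btn.click(reveal_next, inputs=count, outputs=[count] + boxes)
```

This layout also makes it straightforward to wire every prompt box into the generation handler (`inputs=[...] + boxes`), whereas the handler below receives only `prompt1` and `prompt2`.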

+         # Sound Generation Handler
+         def process_inputs(mode, image_file, caption, *prompts):
+             try:
+                 if mode == "Image Input":
+                     if not image_file:
+                         raise gr.Error("Please upload an image")
+                     caption = analyze_image(image_file)
+                     prompts = [caption]
+                 else:
+                     prompts = [p.strip() for p in prompts if p.strip()]
+                     if not prompts:
+                         raise gr.Error("Please enter at least one valid prompt")
+
+                 # Generate individual audio tracks
+                 audio_tracks = []
+                 for prompt in prompts:
+                     if not prompt:
+                         continue
+                     audio = generate_audio(prompt)
+                     if audio is not None:
+                         audio_tracks.append(audio)
+
+                 # Blend audio tracks
+                 if not audio_tracks:
+                     return None
+                 return blend_audios(audio_tracks)
+
+             except Exception as e:
+                 raise gr.Error(f"Processing error: {str(e)}")
+
+         generate_sound_btn.click(
+             process_inputs,
+             inputs=[input_mode, image_upload, caption_display, prompt1, prompt2],
+             outputs=audio_output,
+             concurrency_limit=2
          )

+ if __name__ == "__main__":
+     demo.launch(max_threads=4)