Bils committed on
Commit a4f881b · verified · 1 Parent(s): 95e77bd

Update app.py

Files changed (1)
  1. app.py +113 -80
app.py CHANGED
@@ -7,13 +7,15 @@ import torch
 from scipy.io.wavfile import write
 from diffusers import DiffusionPipeline
 from transformers import pipeline
-from pathlib import Path
+from pydub import AudioSegment
+import numpy as np
 
 load_dotenv()
 hf_token = os.getenv("HF_TKN")
 
 device_id = 0 if torch.cuda.is_available() else -1
 
+# Initialize models
 captioning_pipeline = pipeline(
     "image-to-text",
     model="nlpconnect/vit-gpt2-image-captioning",
@@ -26,120 +28,151 @@ pipe = DiffusionPipeline.from_pretrained(
 )
 
 @spaces.GPU(duration=120)
-def analyze_image_with_free_model(image_file):
+def analyze_image(image_file):
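+    # returns a (caption, error_flag) tuple; callers that only need the text
+    # can take element [0]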
     try:
-        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
-            temp_file.write(image_file)
-            temp_image_path = temp_file.name
-
-        results = captioning_pipeline(temp_image_path)
+        results = captioning_pipeline(image_file)
         if not results or not isinstance(results, list):
             return "Error: Could not generate caption.", True
 
         caption = results[0].get("generated_text", "").strip()
-        if not caption:
-            return "No caption was generated.", True
-        return caption, False
-
+        return caption if caption else "No caption generated.", not bool(caption)
     except Exception as e:
         return f"Error analyzing image: {e}", True
 
 @spaces.GPU(duration=120)
-def get_audioldm_from_caption(caption):
+def generate_audio(prompt):
     try:
         pipe.to("cuda")
         audio_output = pipe(
-            prompt=caption,
+            prompt=prompt,
             num_inference_steps=50,
             guidance_scale=7.5
        )
         pipe.to("cpu")
-        audio = audio_output.audios[0]
-
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
-            write(temp_wav.name, 16000, audio)
-            return temp_wav.name
+        return audio_output.audios[0]
+    except Exception as e:
+        print(f"Error generating audio: {e}")
+        return None
 
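+# blend_audios expects the 1-D float arrays returned by generate_audio; clips
+# are zero-padded to the longest length, summed, and peak-normalized before
+# being written out as a 16 kHz WAV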
+def blend_audios(audio_list):
+    try:
+        # Find the longest audio duration
+        max_length = max([arr.shape[0] for arr in audio_list])
+
+        # Mix all audios, padding or trimming each to the longest length
+        mixed = np.zeros(max_length)
+        for arr in audio_list:
+            if arr.shape[0] < max_length:
+                padded = np.pad(arr, (0, max_length - arr.shape[0]))
+            else:
+                padded = arr[:max_length]
+            mixed += padded
+
+        # Normalize the audio (skip an all-silent mix to avoid dividing by zero)
+        peak = np.max(np.abs(mixed))
+        if peak > 0:
+            mixed = mixed / peak
+
74
+ # Save to temporary file
75
+ _, tmp_path = tempfile.mkstemp(suffix=".wav")
76
+ write(tmp_path, 16000, mixed)
77
+ return tmp_path
78
  except Exception as e:
79
+ print(f"Error blending audio: {e}")
80
  return None
 
 css = """
-#col-container{
-    margin: 0 auto;
-    max-width: 800px;
-}
+#col-container { max-width: 800px; margin: 0 auto; }
+.toggle-row { margin: 1rem 0; }
+.prompt-box { margin-bottom: 0.5rem; }
 """
 
 with gr.Blocks(css=css) as demo:
     with gr.Column(elem_id="col-container"):
         gr.HTML("""
-            <h1 style="text-align: center;">🎶 Generate Sound Effects from Image</h1>
-            <p style="text-align: center;">
-                ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
-            </p>
+            <h1 style="text-align: center;">🎶 Advanced Sound Generator</h1>
+            <p style="text-align: center;">⚡ Powered by Bilsimaging</p>
+        """)
+
+        # Input mode toggle
+        input_mode = gr.Radio(
+            choices=["Image Input", "Text Prompts"],
+            value="Image Input",
+            label="Select Input Mode",
+            elem_classes="toggle-row"
+        )
+
+        # Image input section
+        with gr.Column(visible=True) as image_col:
+            image_upload = gr.Image(type="filepath", label="Upload Image")
+            generate_desc_btn = gr.Button("Generate Description from Image")
+            caption_display = gr.Textbox(label="Generated Description", interactive=False)
+
+        # Text input section
+        with gr.Column(visible=False) as text_col:
+            with gr.Row():
+                prompt1 = gr.Textbox(label="Sound Prompt 1", lines=2)
+                prompt2 = gr.Textbox(label="Sound Prompt 2", lines=2)
+            additional_prompts = gr.Column()
+            add_prompt_btn = gr.Button("➕ Add Another Prompt", variant="secondary")
+            generate_sound_btn = gr.Button("Generate Blended Sound", variant="primary")
+
+        # Audio output
+        audio_output = gr.Audio(label="Final Sound Composition", interactive=False)
+
+        # Documentation section
+        gr.Markdown("""
+        ## 🎚️ How to Use
+        1. **Choose Input Mode** above
+        2. For images: Upload + Generate Description → Generate Sound
+        3. For text: Enter multiple sound prompts → Generate Blended Sound
+        [Support on Ko-fi](https://ko-fi.com/bilsimaging)
         """)
 
-        gr.Markdown("""
-        Welcome to this unique sound effect generator! This tool allows you to upload an image and generate a
-        descriptive caption and a corresponding sound effect, all using free, open-source models on Hugging Face.
-
-        **💡 How it works:**
-        1. **Upload an image**: Choose an image that you'd like to analyze.
-        2. **Generate Description**: Click on 'Generate Description' to get a textual description of your uploaded image.
-        3. **Generate Sound Effect**: Based on the image description, click on 'Generate Sound Effect' to create a
-        sound effect that matches the image context.
-
-        Enjoy the journey from visual to auditory sensation with just a few clicks!
-        """)
-
-        image_upload = gr.File(label="Upload Image", type="binary")
-        generate_description_button = gr.Button("Generate Description")
-        caption_display = gr.Textbox(label="Image Description", interactive=False)
-        generate_sound_button = gr.Button("Generate Sound Effect")
-        audio_output = gr.Audio(label="Generated Sound Effect")
+        # Visitor badge
+        gr.HTML("""
+            <div style="text-align: center; margin-top: 2rem;">
+                <a href="https://visitorbadge.io/status?path=YOUR_SPACE_URL">
+                    <img src="https://api.visitorbadge.io/api/visitors?path=YOUR_SPACE_URL&countColor=%23263759"/>
+                </a>
+            </div>
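+            <!-- note: YOUR_SPACE_URL is an unfilled placeholder; the badge this
+                 commit removes pointed at huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image -->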
+        """)
 
-        gr.Markdown("""
-        ## 👥 How You Can Contribute
-        We welcome contributions and suggestions for improvements. Your feedback is invaluable
-        to the continuous enhancement of this application.
+    # Toggle visibility based on input mode
+    def toggle_input(mode):
+        if mode == "Image Input":
+            return [gr.update(visible=True), gr.update(visible=False)]
+        return [gr.update(visible=False), gr.update(visible=True)]
 
-        For support, questions, or to contribute, please contact us at
-
-        Support our work and get involved by donating through
-        [Ko-fi](https://ko-fi.com/bilsimaging). - Bilel Aroua
-        """)
-
-        gr.Markdown("""
-        ## 📢 Stay Connected
-        This app is a testament to the creative possibilities that emerge when technology meets art.
-        Enjoy exploring the auditory landscape of your images!
-        """)
-
-    def update_caption(image_file):
-        description, _ = analyze_image_with_free_model(image_file)
-        return description
-
-    def generate_sound(description):
-        if not description or description.startswith("Error"):
-            return None
-        audio_path = get_audioldm_from_caption(description)
-        return audio_path
+    input_mode.change(
+        fn=toggle_input,
+        inputs=input_mode,
+        outputs=[image_col, text_col]
+    )
 
-    generate_description_button.click(
-        fn=update_caption,
+    # Image processing chain
+    generate_desc_btn.click(
+        fn=lambda img: analyze_image(img)[0],  # keep only the caption; drop the error flag
         inputs=image_upload,
         outputs=caption_display
+    ).then(
+        fn=lambda: gr.update(interactive=True),
+        outputs=generate_sound_btn
     )
 
-    generate_sound_button.click(
-        fn=generate_sound,
-        inputs=caption_display,
+    # Text processing chain: gather non-empty prompts, generate one clip per
+    # prompt, then blend them into a single track
+    def generate_blended_sound(*prompts):
+        texts = [p for p in prompts if p and p.strip()]
+        audios = [a for a in (generate_audio(t) for t in texts) if a is not None]
+        return blend_audios(audios) if audios else None
+
+    generate_sound_btn.click(
+        fn=generate_blended_sound,
+        inputs=[prompt1, prompt2],
         outputs=audio_output
     )
-
-    gr.HTML('<a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image"><img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image&countColor=%23263759" /></a>')
-    html = gr.HTML()
 
-demo.launch(debug=True, share=True)
+# Queue management
+demo.queue(concurrency_count=2)
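+# note: concurrency_count is the Gradio 3.x queue argument; Gradio 4 renamed
+# it to default_concurrency_limit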
+
+if __name__ == "__main__":
+    demo.launch()