Bils committed on
Commit ccdc62f · verified · 1 Parent(s): 745586c

Update app.py

Files changed (1): app.py (+126 −107)
app.py CHANGED
@@ -1,145 +1,164 @@
- import spaces
- import os
- import tempfile
  import gradio as gr
- from dotenv import load_dotenv
  import torch
- from scipy.io.wavfile import write
  from diffusers import DiffusionPipeline
  from transformers import pipeline
- from pathlib import Path

  load_dotenv()
  hf_token = os.getenv("HF_TKN")

- device_id = 0 if torch.cuda.is_available() else -1
-
- captioning_pipeline = pipeline(
-     "image-to-text",
-     model="nlpconnect/vit-gpt2-image-captioning",
-     device=device_id
- )

- pipe = DiffusionPipeline.from_pretrained(
-     "cvssp/audioldm2",
-     use_auth_token=hf_token
- )

- @spaces.GPU(duration=120)
- def analyze_image_with_free_model(image_file):
      try:
-         with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
-             temp_file.write(image_file)
-             temp_image_path = temp_file.name
-
-         results = captioning_pipeline(temp_image_path)
-         if not results or not isinstance(results, list):
-             return "Error: Could not generate caption.", True

          caption = results[0].get("generated_text", "").strip()
-         if not caption:
-             return "No caption was generated.", True
-         return caption, False
-
      except Exception as e:
-         return f"Error analyzing image: {e}", True
  @spaces.GPU(duration=120)
- def get_audioldm_from_caption(caption):
      try:
-         pipe.to("cuda")
-         audio_output = pipe(
              prompt=caption,
              num_inference_steps=50,
-             guidance_scale=7.5
-         )
-         pipe.to("cpu")
-         audio = audio_output.audios[0]
-
-         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
-             write(temp_wav.name, 16000, audio)
-             return temp_wav.name
-
      except Exception as e:
-         print(f"Error generating audio from caption: {e}")
          return None
  css = """
- #col-container{
      margin: 0 auto;
-     max-width: 800px;
- }
  """

  with gr.Blocks(css=css) as demo:
      with gr.Column(elem_id="col-container"):
          gr.HTML("""
-             <h1 style="text-align: center;">🎶 Generate Sound Effects from Image</h1>
-             <p style="text-align: center;">
-                 ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
-             </p>
          """)

-         gr.Markdown("""
-         Welcome to this unique sound effect generator! This tool allows you to upload an image and generate a
-         descriptive caption and a corresponding sound effect, all using free, open-source models on Hugging Face.
-
-         **💡 How it works:**
-         1. **Upload an image**: Choose an image that you'd like to analyze.
-         2. **Generate Description**: Click on 'Generate Description' to get a textual description of your uploaded image.
-         3. **Generate Sound Effect**: Based on the image description, click on 'Generate Sound Effect' to create a
-            sound effect that matches the image context.
-
-         Enjoy the journey from visual to auditory sensation with just a few clicks!
-         """)
-
-         image_upload = gr.File(label="Upload Image", type="binary")
-         generate_description_button = gr.Button("Generate Description")
-         caption_display = gr.Textbox(label="Image Description", interactive=False)
-         generate_sound_button = gr.Button("Generate Sound Effect")
-         audio_output = gr.Audio(label="Generated Sound Effect")
-
-         gr.Markdown("""
-         ## 👥 How You Can Contribute
-         We welcome contributions and suggestions for improvements. Your feedback is invaluable
-         to the continuous enhancement of this application.
-
-         For support, questions, or to contribute, please contact us at
-
-         Support our work and get involved by donating through
-         [Ko-fi](https://ko-fi.com/bilsimaging). - Bilel Aroua
-         """)
-
-         gr.Markdown("""
-         ## 📢 Stay Connected
-         This app is a testament to the creative possibilities that emerge when technology meets art.
-         Enjoy exploring the auditory landscape of your images!
-         """)

-     def update_caption(image_file):
-         description, _ = analyze_image_with_free_model(image_file)
-         return description

-     def generate_sound(description):
-         if not description or description.startswith("Error"):
-             return None
-         audio_path = get_audioldm_from_caption(description)
-         return audio_path

-     generate_description_button.click(
-         fn=update_caption,
-         inputs=image_upload,
-         outputs=caption_display
      )

-     generate_sound_button.click(
-         fn=generate_sound,
-         inputs=caption_display,
-         outputs=audio_output
      )
-
-     gr.HTML('<a href="https://visitorbadge.io/status?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image"><img src="https://api.visitorbadge.io/api/visitors?path=https%3A%2F%2Fhuggingface.co%2Fspaces%2FBils%2FGenerate-Sound-Effects-from-Image&countColor=%23263759" /></a>')
-     html = gr.HTML()

- demo.launch(debug=True, share=True)
+ import io
+ import os  # used by os.getenv() and the launch guard below
+ import spaces  # provides the @spaces.GPU decorator
+ from pathlib import Path
+ from typing import Tuple, Optional
  import gradio as gr
+ import numpy as np
  import torch
+ from PIL import Image
+ from dotenv import load_dotenv
  from diffusers import DiffusionPipeline
  from transformers import pipeline
+ from huggingface_hub import login

+ # Load environment variables
  load_dotenv()
  hf_token = os.getenv("HF_TKN")
+ if hf_token:
+     login(token=hf_token)
+
+ # Device configuration
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+ # Load models
+ @spaces.GPU
+ def load_models():
+     """Load both models with proper device placement"""
+     caption_pipe = pipeline(
+         "image-to-text",
+         model="nlpconnect/vit-gpt2-image-captioning",
+         device=device
+     )

+     audio_pipe = DiffusionPipeline.from_pretrained(
+         "cvssp/audioldm2",
+         token=hf_token,
+         torch_dtype=torch_dtype
+     )
+     return caption_pipe, audio_pipe

+ caption_pipe, audio_pipe = load_models()

+ def analyze_image(image_bytes: bytes) -> Tuple[str, bool]:
+     """Generate caption from image bytes with enhanced error handling"""
      try:
+         image = Image.open(io.BytesIO(image_bytes))
+         if image.mode != "RGB":
+             image = image.convert("RGB")
+
+         results = caption_pipe(image)

+         if not results or not isinstance(results, list):
+             return "Error: Invalid response from caption model", True
+
          caption = results[0].get("generated_text", "").strip()
+         return caption or "No caption generated", not bool(caption)
+
      except Exception as e:
+         return f"Image processing error: {str(e)}", True

  @spaces.GPU(duration=120)
+ def generate_audio(caption: str) -> Optional[Tuple[int, np.ndarray]]:
+     """Generate audio from caption with resource management"""
+     # Remember the pipeline's current device so the finally block can restore
+     # it; DiffusionPipeline exposes .device directly (it has no .parameters())
+     original_device = audio_pipe.device
      try:
+         audio_pipe.to(device)
+
+         # Run the AudioLDM2 pipeline on the caption
+         audio = audio_pipe(
              prompt=caption,
              num_inference_steps=50,
+             guidance_scale=7.5,
+             audio_length_in_s=5.0  # keep the generated clip short
+         ).audios[0]
+
+         # Post-processing
+         audio = audio.squeeze()  # collapse to a mono channel
+         audio = np.clip(audio, -1, 1)  # keep samples in the valid range
+         return (16000, audio)
+
      except Exception as e:
+         print(f"Audio generation error: {str(e)}")
          return None
+
+     finally:
+         audio_pipe.to(original_device)
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()

+ # UI Components
  css = """
+ #col-container {
+     max-width: 800px;
      margin: 0 auto;
+ }
+ .disclaimer {
+     font-size: 0.9em;
+     color: #666;
+ }
  """

  with gr.Blocks(css=css) as demo:
      with gr.Column(elem_id="col-container"):
          gr.HTML("""
+             <h1 style="text-align: center;">🎶 Image to Sound Effect Generator</h1>
+             <p style="text-align: center;">
+                 ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
+             </p>
          """)
+
+         with gr.Row():
+             image_input = gr.Image(type="filepath", label="Upload Image")
+             caption_output = gr.Textbox(label="Generated Description", interactive=False)
+
+         with gr.Row():
+             generate_btn = gr.Button("Generate Description", variant="primary")
+             audio_output = gr.Audio(label="Generated Sound", interactive=False)
+             sound_btn = gr.Button("Generate Sound", variant="secondary")
+
+         gr.Examples(
+             examples=[str(Path(__file__).parent / "examples" / f) for f in ["storm.jpg", "city.jpg"]],
+             inputs=image_input,
+             outputs=[caption_output, audio_output],
+             fn=lambda x: (analyze_image(Path(x).read_bytes())[0], None),
+             cache_examples=True
+         )

+         gr.Markdown("### 🛠️ Usage Tips")
+         gr.Markdown("""
+         - Use clear, high-contrast images for best results
+         - Complex scenes may require multiple generations
+         - Keep sound generation under 10 seconds for quick results
+         """)

+         gr.Markdown("### ⚠️ Disclaimer", elem_classes="disclaimer")
+         gr.Markdown("""
+         Generated content may not always be accurate. Use at your own discretion.
+         [Privacy Policy](https://bilsimaging.com/privacy) |
+         [Terms of Service](https://bilsimaging.com/terms)
+         """)

+     # Event handling
+     generate_btn.click(
+         fn=lambda x: analyze_image(Path(x).read_bytes())[0],
+         inputs=image_input,
+         outputs=caption_output,
+         api_name="describe"
+     )

+     sound_btn.click(
+         fn=generate_audio,
+         inputs=caption_output,
+         outputs=audio_output,
+         api_name="generate_sound"
      )

+     # Clear stale outputs whenever a new image is uploaded
+     image_input.change(
+         fn=lambda: [gr.update(value=""), gr.update(value=None)],
+         outputs=[caption_output, audio_output]
      )

+ if __name__ == "__main__":
+     demo.launch(server_name="0.0.0.0" if os.getenv("SPACE_ID") else "127.0.0.1")
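
The new api_name arguments expose the two click handlers as named API endpoints. Below is a minimal sketch (not part of the commit) of calling them remotely with gradio_client; it assumes gradio_client is installed, the Space is running, and that the Space id is the one in the visitor-badge URL of the removed code.

from gradio_client import Client, handle_file

# Space id taken from the visitor-badge URL above (verify before use)
client = Client("Bils/Generate-Sound-Effects-from-Image")

# "/describe" corresponds to generate_btn.click(..., api_name="describe")
caption = client.predict(handle_file("photo.jpg"), api_name="/describe")
print("Caption:", caption)

# "/generate_sound" corresponds to sound_btn.click(..., api_name="generate_sound")
audio_path = client.predict(caption, api_name="/generate_sound")
print("Audio written to:", audio_path)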