"""Hugging Face Space (running on ZeroGPU): generate and blend sound effects
from an uploaded image's caption or from free-text prompts."""
import gradio as gr
import os
import tempfile
import torch
import numpy as np
from scipy.io.wavfile import write
from dotenv import load_dotenv
from diffusers import DiffusionPipeline
from transformers import pipeline
from PIL import Image
from pydub import AudioSegment
from typing import List
import spaces
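# On ZeroGPU Spaces the `spaces` package supplies the @spaces.GPU decorator;
# functions that need the GPU must be wrapped with it so a device is attached
# for the duration of the call.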
# Load environment variables
load_dotenv()
HF_TOKEN = os.getenv("HF_TKN")

# Device configuration
device = "cuda" if torch.cuda.is_available() else "cpu"
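# Note (assumption about ZeroGPU behavior): no GPU may be visible at import
# time, so this check can resolve to "cpu" here; the @spaces.GPU-decorated
# calls below still run on GPU once one is attached.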
# Initialize models once at startup
def load_caption_model():
    return pipeline(
        "image-to-text",
        model="Salesforce/blip-image-captioning-base",
        device=device,
    )

def load_audio_model():
    # `token` replaces the deprecated `use_auth_token` argument
    pipe = DiffusionPipeline.from_pretrained(
        "cvssp/audioldm2",
        token=HF_TOKEN,
    )
    return pipe

caption_pipe = load_caption_model()
audio_pipe = load_audio_model().to(device)
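# Both models load once per process. The BLIP captioner is light enough for
# CPU; AudioLDM2 is the expensive step and is what the GPU (when attached)
# is for.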
def analyze_image(image_file):
    """Generate a caption from an image, validating the file first."""
    try:
        # gr.Image(type="filepath") passes a path on disk, not raw bytes
        try:
            image = Image.open(image_file)
            image.verify()
            # verify() consumes the file object, so reopen before inference
            image = Image.open(image_file)
        except Exception as e:
            raise ValueError(f"Invalid image file: {str(e)}")
        results = caption_pipe(image)
        if not results or not isinstance(results, list):
            raise RuntimeError("No caption generated")
        caption = results[0].get("generated_text", "").strip()
        if not caption:
            raise RuntimeError("Empty caption generated")
        return caption
    except Exception as e:
        raise gr.Error(f"Image processing error: {str(e)}")
@spaces.GPU  # ZeroGPU: attach a GPU for the duration of this call
def generate_audio(prompt: str, num_steps=100, guidance_scale=7.5):
    """Generate a 10-second audio clip from a single text prompt."""
    try:
        if not prompt or len(prompt) < 10:
            raise ValueError("Prompt must be at least 10 characters")
        with torch.inference_mode():
            audio = audio_pipe(
                prompt=prompt,
                num_inference_steps=int(num_steps),
                guidance_scale=guidance_scale,
                audio_length_in_s=10,
            ).audios[0]
        # AudioLDM2 returns float samples at 16 kHz; convert to 16-bit PCM so
        # pydub can read the WAV back for mixing
        pcm = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile:
            write(tmpfile.name, 16000, pcm)
        return tmpfile.name
    except Exception as e:
        raise gr.Error(f"Audio generation error: {str(e)}")
def blend_audios(audio_files: List[str]) -> str:
    """Mix multiple audio files into one."""
    try:
        if not audio_files:
            raise ValueError("No audio files to blend")
        # Load the first clip to establish the base track and its length
        base_audio = AudioSegment.from_wav(audio_files[0])
        mixed = base_audio
        # Overlay subsequent tracks, truncating any that run longer
        for file in audio_files[1:]:
            track = AudioSegment.from_wav(file)
            if len(track) > len(mixed):
                mixed = mixed.overlay(track[:len(mixed)])
            else:
                mixed = mixed.overlay(track)
        # Export the mixed audio
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile:
            mixed.export(tmpfile.name, format="wav")
        return tmpfile.name
    except Exception as e:
        raise gr.Error(f"Audio mixing error: {str(e)}")
NUM_TRACKS = 5  # fixed number of per-track audio players (and prompt boxes)

def process_inputs(input_choice, image_file, num_steps, guidance_scale, *prompts):
    """Handle both image and text input modes."""
    try:
        # Drop empty prompt boxes
        valid_prompts = [p.strip() for p in prompts if p and p.strip()]
        if input_choice == "Image":
            if not image_file:
                raise gr.Error("Please upload an image")
            main_prompt = analyze_image(image_file)
            valid_prompts = [main_prompt] + valid_prompts
        else:
            if not valid_prompts:
                raise gr.Error("Please enter at least one text prompt")
        # Generate one clip per prompt
        audio_files = [generate_audio(p, num_steps, guidance_scale) for p in valid_prompts]
        # Blend all clips into a single track
        final_audio = blend_audios(audio_files)
        # Fill the fixed per-track players, hiding any that are unused
        track_updates = [
            gr.update(value=audio_files[i], visible=True) if i < len(audio_files)
            else gr.update(value=None, visible=False)
            for i in range(NUM_TRACKS)
        ]
        return [valid_prompts, final_audio] + track_updates
    except Exception as e:
        raise gr.Error(str(e))
# Gradio interface
css = """
#main-container { max-width: 800px; margin: 0 auto; }
.dark { background: #1a1a1a; }
.prompt-box { margin-bottom: 10px; }
.audio-track { margin: 5px 0; }
"""
with gr.Blocks(css=css, theme=gr.themes.Default(primary_hue="emerald")) as app:
    with gr.Column(elem_id="main-container"):
        gr.Markdown("""
        # 🎨 Image to Sound Generator
        Transform visual content or text prompts into mixed sound effects!
        """)

        # Input mode selector
        input_choice = gr.Radio(
            choices=["Image", "Text"],
            value="Image",
            label="Input Mode",
            interactive=True,
        )

        # Image input section
        with gr.Row(visible=True) as image_row:
            image_input = gr.Image(type="filepath", label="Upload Image")

        # Text input section: NUM_TRACKS prompt boxes, only the first three
        # visible until "Add Another Prompt" reveals the rest
        with gr.Column(visible=False) as text_inputs_col:
            prompt_components = [
                gr.Textbox(label=f"Sound Effect {i + 1}", lines=2, visible=(i < 3))
                for i in range(NUM_TRACKS)
            ]
            add_prompt_btn = gr.Button("Add Another Prompt", variant="secondary")

        # Number of currently visible prompt boxes
        current_prompts = gr.State(value=3)
        def add_prompt(current_count):
            # Gradio can't create components after launch, so reveal the next
            # pre-built (hidden) prompt box instead, up to NUM_TRACKS
            new_count = min(current_count + 1, NUM_TRACKS)
            return [new_count] + [
                gr.update(visible=(i < new_count)) for i in range(NUM_TRACKS)
            ]

        add_prompt_btn.click(
            fn=add_prompt,
            inputs=current_prompts,
            outputs=[current_prompts] + prompt_components,
        )
        # Toggle between image/text inputs
        def toggle_inputs(choice):
            if choice == "Image":
                return [gr.update(visible=True), gr.update(visible=False)]
            return [gr.update(visible=False), gr.update(visible=True)]

        input_choice.change(
            fn=toggle_inputs,
            inputs=input_choice,
            outputs=[image_row, text_inputs_col],
        )
        # Generation controls
        with gr.Accordion("Advanced Settings", open=False):
            steps_slider = gr.Slider(10, 200, 100, label="Generation Steps")
            guidance_slider = gr.Slider(1.0, 15.0, 7.5, label="Guidance Scale")
        generate_btn = gr.Button("Generate Mixed Sound", variant="primary")

        # Outputs
        with gr.Column():
            gr.Markdown("### Generation Results")
            prompt_display = gr.JSON(label="Used Prompts")
            final_audio = gr.Audio(label="Blended Sound Effect", interactive=False)
            with gr.Accordion("Individual Tracks", open=False):
                track_components = [gr.Audio(visible=False) for _ in range(NUM_TRACKS)]
        # Examples: clicking one fills the inputs; generation still happens
        # via the button, so nothing runs (or is cached) at build time
        gr.Examples(
            examples=[
                ["examples/storm.jpg", "A dramatic thunderstorm", "Heavy rain pouring", "Distant rumble"],
                [None, "Clock ticking", "Crowd murmuring", "Footsteps on concrete"],
            ],
            inputs=[image_input] + prompt_components[:3],
            outputs=[prompt_display, final_audio, *track_components],
            fn=lambda img, *p: process_inputs("Image" if img else "Text", img, 100, 7.5, *p),
            cache_examples=False,
        )
        # Contribution section
        with gr.Column():
            gr.Markdown("""
            ## 👥 How You Can Contribute
            We welcome contributions! Contact us at [[email protected]](mailto:[email protected]).
            Support us on [Ko-fi](https://ko-fi.com/bilsimaging). - Bilel Aroua
            """)
            gr.HTML("""
            <div style="text-align: center;">
                <a href="https://visitorbadge.io/status?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image">
                    <img src="https://api.visitorbadge.io/api/visitors?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image&countColor=%23263759" />
                </a>
            </div>
            """)

        # Footer
        gr.Markdown("""
        ---
        [GitHub Repository](https://github.com/bilsimaging/Imaginesound)
        """)
    # Event handling: pass the advanced-settings sliders through so they
    # actually affect generation
    generate_btn.click(
        fn=process_inputs,
        inputs=[input_choice, image_input, steps_slider, guidance_slider] + prompt_components,
        outputs=[prompt_display, final_audio, *track_components],
    )

if __name__ == "__main__":
    app.launch(debug=True, share=True)