Spaces:

Bils
/

Generate-Sound-Effects-from-Image

Runtime error

App Files Files Community

Generate-Sound-Effects-from-Image / app.py

Bils

Update app.py

4d9e689 verified 5 months ago

raw

history blame

9 kB

	import functools
	import io
	import os
	import tempfile
	from typing import List

	import gradio as gr
	import numpy as np
	import torch
	from diffusers import DiffusionPipeline
	from dotenv import load_dotenv
	from huggingface_hub import spaces
	from PIL import Image
	from pydub import AudioSegment
	from scipy.io.wavfile import write
	from transformers import pipeline

	# Read settings through python-dotenv, then pick up the Hugging Face token
	load_dotenv()
	HF_TOKEN = os.environ.get("HF_TKN")

	# Run on the GPU whenever torch reports one, otherwise fall back to the CPU
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"

	# Initialize models
	@functools.lru_cache(maxsize=1)
	def load_caption_model():
	    """Build the image-captioning pipeline, memoised so it is created only once.

	    Returns:
	        The ``transformers`` "image-to-text" pipeline for
	        ``Salesforce/blip-image-captioning-base`` placed on ``device``.
	    """
	    # functools.lru_cache replaces the former ``@gr.cache()`` decorator:
	    # repeated calls hand back the same pipeline object instead of reloading.
	    return pipeline(
	        "image-to-text",
	        model="Salesforce/blip-image-captioning-base",
	        device=device,
	    )

	@functools.lru_cache(maxsize=1)
	def load_audio_model():
	    """Load the ``cvssp/audioldm2`` diffusion pipeline, memoised so it loads once.

	    Returns:
	        The ``DiffusionPipeline`` instance; moving it to a device is left to
	        the caller.
	    """
	    # functools.lru_cache replaces the former ``@gr.cache()`` decorator:
	    # repeated calls hand back the same pipeline object instead of reloading.
	    # NOTE(review): newer diffusers releases may prefer ``token=`` over
	    # ``use_auth_token=`` - confirm against the pinned version.
	    return DiffusionPipeline.from_pretrained(
	        "cvssp/audioldm2",
	        use_auth_token=HF_TOKEN,
	    )

	# Build both pipelines once at import time; the audio pipeline is then
	# moved onto the selected device.
	caption_pipe = load_caption_model()
	audio_pipe = load_audio_model()
	audio_pipe = audio_pipe.to(device)

	@spaces.GPU(duration=120)
	def analyze_image(image_file):
	    """Generate a caption for an image after validating that it can be opened.

	    Args:
	        image_file: The image to caption. May be a filesystem path (what the
	            ``gr.Image(type="filepath")`` input delivers), raw encoded bytes,
	            or an already opened ``PIL.Image.Image``.

	    Returns:
	        The stripped ``generated_text`` of the first result from ``caption_pipe``.

	    Raises:
	        gr.Error: If the image is invalid or no non-empty caption is produced.
	    """

	    def _open():
	        # Only raw bytes are wrapped in BytesIO; handing it a str path would
	        # raise TypeError, so paths go straight to Image.open.
	        if isinstance(image_file, (bytes, bytearray)):
	            return Image.open(io.BytesIO(image_file))
	        return Image.open(image_file)

	    try:
	        if isinstance(image_file, Image.Image):
	            image = image_file
	        else:
	            try:
	                with _open() as probe:
	                    probe.verify()
	                # The verified handle is discarded and the image is opened
	                # again for actual use, as the original code did.
	                image = _open()
	            except Exception as e:
	                raise ValueError(f"Invalid image file: {e}") from e

	        results = caption_pipe(image)
	        if not results or not isinstance(results, list):
	            raise RuntimeError("No caption generated")

	        caption = results[0].get("generated_text", "").strip()
	        if not caption:
	            raise RuntimeError("Empty caption generated")

	        return caption

	    except Exception as e:
	        raise gr.Error(f"Image processing error: {e}") from e

	@spaces.GPU(duration=120)
	def generate_audio(prompt: str, num_steps=100, guidance_scale=7.5):
	    """Generate a 10-second sound effect from a single text prompt.

	    Args:
	        prompt: Text description of the sound; must be at least 10 characters.
	        num_steps: Number of inference steps (coerced to ``int``).
	        guidance_scale: Guidance scale passed to the pipeline (coerced to ``float``).

	    Returns:
	        Path of a temporary ``.wav`` file holding the generated audio. The file
	        is not deleted automatically.

	    Raises:
	        gr.Error: If the prompt is too short or generation/writing fails.
	    """
	    try:
	        if not prompt or len(prompt) < 10:
	            raise ValueError("Prompt must be at least 10 characters")

	        with torch.inference_mode():
	            audio = audio_pipe(
	                prompt=prompt,
	                num_inference_steps=int(num_steps),
	                guidance_scale=float(guidance_scale),
	                audio_length_in_s=10,
	            ).audios[0]

	        # Store as 16-bit PCM rather than raw floats so the file does not
	        # depend on float-WAV support in whatever reads it later.
	        # NOTE(review): assumes the pipeline returns float samples in
	        # [-1, 1]; values outside that range are clipped - confirm.
	        samples = np.clip(np.asarray(audio, dtype=np.float32), -1.0, 1.0)
	        pcm = (samples * 32767).astype(np.int16)

	        # Reserve a unique path, then write once the handle is closed so the
	        # file is never opened twice at the same time.
	        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmpfile:
	            wav_path = tmpfile.name
	        write(wav_path, 16000, pcm)
	        return wav_path

	    except Exception as e:
	        raise gr.Error(f"Audio generation error: {e}") from e

	@spaces.GPU(duration=120)
	def blend_audios(audio_files: List[str]) -> str:
	    """Overlay several WAV files into one mix and return the mixed file's path.

	    The first file is the base of the mix. Every later file is overlaid on
	    it, trimmed to the current mix length when it is longer.

	    Raises:
	        gr.Error: If the list is empty or any load/overlay/export step fails.
	    """
	    try:
	        if not audio_files:
	            raise ValueError("No audio files to blend")

	        # The first track seeds the mix
	        first_path, remaining_paths = audio_files[0], audio_files[1:]
	        blended = AudioSegment.from_wav(first_path)

	        # Layer each remaining track on top
	        for path in remaining_paths:
	            layer = AudioSegment.from_wav(path)
	            limit = len(blended)
	            clipped = layer[:limit] if len(layer) > limit else layer
	            blended = blended.overlay(clipped)

	        # Write the result to a temporary file that outlives this call
	        out_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
	        with out_file:
	            blended.export(out_file.name, format="wav")
	            return out_file.name

	    except Exception as e:
	        raise gr.Error(f"Audio mixing error: {str(e)}")

	def process_inputs(input_choice, image_file, *prompts, num_steps=100, guidance_scale=7.5):
	    """Turn an image and/or text prompts into individual and blended sound effects.

	    Args:
	        input_choice: ``"Image"`` to caption ``image_file`` and use the caption
	            as the first prompt; any other value uses only the text prompts.
	        image_file: Image handed to ``analyze_image`` in image mode.
	        *prompts: Text prompts; ``None`` and blank entries are ignored.
	        num_steps: Inference steps forwarded to ``generate_audio``.
	        guidance_scale: Guidance scale forwarded to ``generate_audio``.

	    Returns:
	        A tuple ``(valid_prompts, final_audio, audio_files)``: the prompts that
	        were used, the path of the blended file, and the per-prompt file paths.

	    Raises:
	        gr.Error: On missing input or any failure in captioning, generation
	            or blending.
	    """
	    try:
	        # Empty textboxes may arrive as "" or None; drop both
	        valid_prompts = [p.strip() for p in prompts if p and p.strip()]

	        if input_choice == "Image":
	            if not image_file:
	                raise gr.Error("Please upload an image")
	            valid_prompts = [analyze_image(image_file), *valid_prompts]
	        elif not valid_prompts:
	            raise gr.Error("Please enter at least one text prompt")

	        # One track per prompt, then a single blended mix
	        audio_files = [
	            generate_audio(prompt, num_steps, guidance_scale)
	            for prompt in valid_prompts
	        ]
	        final_audio = blend_audios(audio_files)
	        return valid_prompts, final_audio, audio_files

	    except gr.Error:
	        # Already user-facing; do not wrap it a second time
	        raise
	    except Exception as e:
	        raise gr.Error(str(e)) from e


	# Prompt textboxes are all created up front (components cannot be added to
	# the page from an event handler); only the first few start visible.
	MAX_PROMPTS = 5
	INITIAL_PROMPTS = 3
	# One slot per prompt plus one for the caption-derived track in image mode
	MAX_TRACKS = MAX_PROMPTS + 1
	EXAMPLE_IMAGE = "examples/storm.jpg"

	# Gradio interface
	css = """
	#main-container { max-width: 800px; margin: 0 auto; }
	.dark { background: #1a1a1a; }
	.prompt-box { margin-bottom: 10px; }
	.audio-track { margin: 5px 0; }
	"""

	with gr.Blocks(css=css, theme=gr.themes.Default(primary_hue="emerald")) as app:
	    with gr.Column(elem_id="main-container"):
	        gr.Markdown("""
	# 🎨 Image to Sound Generator
	Transform visual content or text prompts into mixed sound effects!
	""")

	        # Input Mode Selector
	        input_choice = gr.Radio(
	            choices=["Image", "Text"],
	            value="Image",
	            label="Input Mode",
	            interactive=True,
	        )

	        # Image Input Section
	        with gr.Row(visible=True) as image_row:
	            image_input = gr.Image(type="filepath", label="Upload Image")

	        # Text Input Section
	        with gr.Column(visible=False) as text_inputs_col:
	            prompt_components = [
	                gr.Textbox(
	                    label=f"Sound Effect {i+1}",
	                    lines=2,
	                    visible=i < INITIAL_PROMPTS,
	                )
	                for i in range(MAX_PROMPTS)
	            ]
	            add_prompt_btn = gr.Button("Add Another Prompt", variant="secondary")

	        # Number of prompt boxes currently shown
	        current_prompts = gr.State(value=INITIAL_PROMPTS)

	        def add_prompt(current_count):
	            """Reveal the next hidden prompt box, up to MAX_PROMPTS."""
	            new_count = min(current_count + 1, MAX_PROMPTS)
	            visibility = [
	                gr.update(visible=i < new_count) for i in range(MAX_PROMPTS)
	            ]
	            # Exactly one value per declared output: state, boxes, button
	            return [
	                new_count,
	                *visibility,
	                gr.update(interactive=new_count < MAX_PROMPTS),
	            ]

	        add_prompt_btn.click(
	            fn=add_prompt,
	            inputs=current_prompts,
	            outputs=[current_prompts, *prompt_components, add_prompt_btn],
	        )

	        # Toggle between image/text inputs
	        def toggle_inputs(choice):
	            """Show the image row in image mode, the prompt column otherwise."""
	            if choice == "Image":
	                return [gr.update(visible=True), gr.update(visible=False)]
	            return [gr.update(visible=False), gr.update(visible=True)]

	        input_choice.change(
	            fn=toggle_inputs,
	            inputs=input_choice,
	            outputs=[image_row, text_inputs_col],
	        )

	        # Generation Controls
	        with gr.Accordion("Advanced Settings", open=False):
	            steps_slider = gr.Slider(10, 200, 100, label="Generation Steps")
	            guidance_slider = gr.Slider(1.0, 15.0, 7.5, label="Guidance Scale")

	        generate_btn = gr.Button("Generate Mixed Sound", variant="primary")

	        # Outputs
	        with gr.Column():
	            gr.Markdown("### Generation Results")
	            prompt_display = gr.JSON(label="Used Prompts")
	            final_audio = gr.Audio(label="Blended Sound Effect", interactive=False)

	            with gr.Accordion("Individual Tracks", open=False):
	                track_components = [
	                    gr.Audio(visible=False) for _ in range(MAX_TRACKS)
	                ]

	        # Examples only fill in the inputs; nothing is generated or cached at
	        # startup. Each row supplies: mode, image, then three prompts.
	        example_rows = [
	            ["Text", None, "Clock ticking", "Crowd murmuring", "Footsteps on concrete"],
	        ]
	        # Offer the image example only when its file is actually present
	        if os.path.exists(EXAMPLE_IMAGE):
	            example_rows.insert(
	                0,
	                ["Image", EXAMPLE_IMAGE, "A dramatic thunderstorm", "Heavy rain pouring", "Distant rumble"],
	            )
	        # NOTE(review): assumes filling the radio from an example fires its
	        # change event so the matching input section is shown - confirm.
	        gr.Examples(
	            examples=example_rows,
	            inputs=[input_choice, image_input, *prompt_components[:3]],
	            cache_examples=False,
	        )

	        # Contribution Section
	        with gr.Column():
	            gr.Markdown("""
	## 👥 How You Can Contribute
	We welcome contributions! Contact us at [[email protected]](mailto:[email protected]).
	Support us on [Ko-fi](https://ko-fi.com/bilsimaging). - Bilel Aroua
	""")
	            gr.HTML("""
	<div style="text-align: center;">
	<a href="https://visitorbadge.io/status?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image">
	<img src="https://api.visitorbadge.io/api/visitors?path=https://huggingface.co/spaces/Bils/Generate-Sound-Effects-from-Image&countColor=%23263759" />
	</a>
	</div>
	""")

	        # Footer
	        gr.Markdown("""
	---
	[GitHub Repository](https://github.com/bilsimaging/Imaginesound)*
	""")

	    def run_generation(input_choice, image_file, num_steps, guidance_scale, *prompts):
	        """Call process_inputs and expand its result to one value per output."""
	        used_prompts, blended, tracks = process_inputs(
	            input_choice,
	            image_file,
	            *prompts,
	            num_steps=num_steps,
	            guidance_scale=guidance_scale,
	        )
	        shown = list(tracks)[:MAX_TRACKS]
	        track_updates = [gr.update(value=path, visible=True) for path in shown]
	        # Clear and hide the slots this run did not fill
	        track_updates += [
	            gr.update(value=None, visible=False)
	            for _ in range(MAX_TRACKS - len(shown))
	        ]
	        return [used_prompts, blended, *track_updates]

	    # Event handling
	    generate_btn.click(
	        fn=run_generation,
	        inputs=[input_choice, image_input, steps_slider, guidance_slider, *prompt_components],
	        outputs=[prompt_display, final_audio, *track_components],
	    )

	# Start the app only when this file is executed as a script
	if __name__ == "__main__":
	    launch_options = {"debug": True, "share": True}
	    app.launch(**launch_options)