# Hugging Face Space — runs on ZeroGPU ("Running on Zero").
import os
import tempfile
from pathlib import Path

import gradio as gr
import spaces
import torch
from diffusers import DiffusionPipeline
from dotenv import load_dotenv
from scipy.io.wavfile import write
from transformers import pipeline
# --- One-time module setup: environment, device selection, model pipelines ---

# Read HF_TKN (and anything else) from a local .env file if present.
load_dotenv()
hf_token = os.getenv("HF_TKN")

# transformers pipelines take a device index: 0 = first GPU, -1 = CPU.
device_id = 0 if torch.cuda.is_available() else -1

# Image -> caption model (ViT encoder + GPT-2 decoder), loaded once and
# shared across requests.
captioning_pipeline = pipeline(
    "image-to-text",
    model="nlpconnect/vit-gpt2-image-captioning",
    device=device_id,
)

# Caption -> audio latent-diffusion model. Kept on CPU until inference
# (see get_audioldm_from_caption).
# NOTE(review): `use_auth_token` is deprecated in newer diffusers in favour
# of `token=` — kept for compatibility with the pinned version; confirm.
pipe = DiffusionPipeline.from_pretrained(
    "cvssp/audioldm2",
    use_auth_token=hf_token,
)
def analyze_image_with_free_model(image_file):
    """Generate a caption for an uploaded image.

    Args:
        image_file: Raw image bytes (from ``gr.File(type="binary")``).

    Returns:
        tuple[str, bool]: ``(message, is_error)`` — the caption and ``False``
        on success, or a human-readable error message and ``True`` on failure.
    """
    temp_image_path = None
    try:
        # The captioning pipeline expects a file path, so persist the bytes.
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
            temp_file.write(image_file)
            temp_image_path = temp_file.name

        results = captioning_pipeline(temp_image_path)
        if not results or not isinstance(results, list):
            return "Error: Could not generate caption.", True

        caption = results[0].get("generated_text", "").strip()
        if not caption:
            return "No caption was generated.", True
        return caption, False
    except Exception as e:
        return f"Error analyzing image: {e}", True
    finally:
        # Fix: the original leaked one temp file per request (delete=False,
        # never removed). Clean up on every path.
        if temp_image_path and os.path.exists(temp_image_path):
            os.remove(temp_image_path)
def get_audioldm_from_caption(caption):
    """Synthesize a sound effect from a text caption with AudioLDM2.

    Args:
        caption: Text prompt describing the desired sound.

    Returns:
        str | None: Path to a generated 16 kHz WAV file, or ``None`` on error.
    """
    try:
        # Fix: the original called pipe.to("cuda") unconditionally, crashing
        # on CPU-only hosts even though the rest of the file guards on
        # torch.cuda.is_available().
        device = "cuda" if torch.cuda.is_available() else "cpu"
        try:
            pipe.to(device)
            audio_output = pipe(
                prompt=caption,
                num_inference_steps=50,
                guidance_scale=7.5,
            )
        finally:
            # Fix: always park the model back on CPU, even if generation
            # raises (the original left it on CUDA on error).
            pipe.to("cpu")

        audio = audio_output.audios[0]
        # AudioLDM2 emits 16 kHz audio; write it out for the gr.Audio widget.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
            write(temp_wav.name, 16000, audio)
            return temp_wav.name
    except Exception as e:
        print(f"Error generating audio from caption: {e}")
        return None
# ---------------------------------------------------------------------------
# Gradio UI: logo/header row, how-it-works blurb, the upload -> caption ->
# sound pipeline, and an about/contact footer.
# ---------------------------------------------------------------------------
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
    # Header: small logo next to the title/subtitle banner.
    with gr.Row():
        with gr.Column(scale=1):
            gr.Image(
                value="https://via.placeholder.com/150",
                interactive=False,
                label="App Logo",
                elem_id="app-logo",
            )
        with gr.Column(scale=5):
            gr.HTML("""
            <div style="text-align: center; font-size: 32px; font-weight: bold; margin-bottom: 10px;">🎶 Image-to-Sound Generator</div>
            <div style="text-align: center; font-size: 16px; color: #6c757d;">Transform your images into descriptive captions and immersive soundscapes.</div>
            """)

    # Short usage instructions.
    with gr.Row():
        with gr.Column():
            gr.Markdown("""
            ### How It Works
            1. **Upload an Image**: Select an image to analyze.
            2. **Generate Description**: Get a detailed caption describing your image.
            3. **Generate Sound**: Create an audio representation based on the caption.
            """)

    # Main pipeline: upload -> caption -> audio, left to right.
    with gr.Row():
        with gr.Column(scale=1):
            image_upload = gr.File(label="Upload Image", type="binary")
            generate_description_button = gr.Button(
                "Generate Description", variant="primary"
            )
        with gr.Column(scale=2):
            caption_display = gr.Textbox(
                label="Generated Caption",
                interactive=False,
                placeholder="Your image caption will appear here.",
            )
            generate_sound_button = gr.Button("Generate Sound", variant="primary")
        with gr.Column(scale=1):
            audio_output = gr.Audio(label="Generated Sound Effect", interactive=False)

    # Footer: about text and contact link.
    with gr.Row():
        gr.Markdown("""
        ## About This App
        This application uses advanced machine learning models to transform images into text captions and generate matching sound effects. It's a unique blend of visual and auditory creativity, powered by state-of-the-art AI technology.
        For inquiries, contact us at [contact@bilsimaging.com](mailto:contact@bilsimaging.com).
        """)

    def update_caption(image_file):
        """Run captioning and surface the message (caption or error text)."""
        description, _ = analyze_image_with_free_model(image_file)
        return description

    def generate_sound(description):
        """Turn a valid caption into audio; skip empty/error captions."""
        if not description or description.startswith("Error"):
            return None
        return get_audioldm_from_caption(description)

    generate_description_button.click(
        fn=update_caption,
        inputs=image_upload,
        outputs=caption_display,
    )
    generate_sound_button.click(
        fn=generate_sound,
        inputs=caption_display,
        outputs=audio_output,
    )

demo.launch(debug=True, share=True)