"""Image-to-Sound Generator.

Gradio app that captions an uploaded image (ViT-GPT2) and then synthesizes
a matching soundscape from the caption (AudioLDM2).
"""

import os
import tempfile

import gradio as gr
import spaces
import torch
from diffusers import DiffusionPipeline
from dotenv import load_dotenv
from scipy.io.wavfile import write
from transformers import pipeline

load_dotenv()
hf_token = os.getenv("HF_TKN")

# GPU 0 for the captioning pipeline when available, else CPU (-1).
device_id = 0 if torch.cuda.is_available() else -1

captioning_pipeline = pipeline(
    "image-to-text",
    model="nlpconnect/vit-gpt2-image-captioning",
    device=device_id,
)

# NOTE(review): `use_auth_token` is deprecated in recent diffusers releases
# in favor of `token` — confirm the pinned diffusers version before switching.
pipe = DiffusionPipeline.from_pretrained(
    "cvssp/audioldm2",
    use_auth_token=hf_token,
)


@spaces.GPU(duration=120)
def analyze_image_with_free_model(image_file):
    """Generate a text caption for an uploaded image.

    Args:
        image_file: Raw image bytes (from a ``gr.File`` with ``type="binary"``).

    Returns:
        A ``(message, error_flag)`` tuple. On success ``message`` is the
        caption and ``error_flag`` is ``False``; on failure ``message`` is a
        human-readable error string and ``error_flag`` is ``True``.
    """
    temp_image_path = None
    try:
        # Persist the bytes to disk and CLOSE the handle before handing the
        # path to the pipeline, so it can be re-opened on all platforms
        # (re-opening a still-open NamedTemporaryFile fails on Windows).
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
            temp_file.write(image_file)
            temp_image_path = temp_file.name

        results = captioning_pipeline(temp_image_path)
        if not results or not isinstance(results, list):
            return "Error: Could not generate caption.", True

        caption = results[0].get("generated_text", "").strip()
        if not caption:
            return "No caption was generated.", True
        return caption, False
    except Exception as e:
        return f"Error analyzing image: {e}", True
    finally:
        # The original leaked one temp image per call (delete=False, never
        # removed); clean it up here.
        if temp_image_path:
            try:
                os.remove(temp_image_path)
            except OSError:
                pass


@spaces.GPU(duration=120)
def get_audioldm_from_caption(caption):
    """Synthesize a sound effect from a text caption with AudioLDM2.

    Args:
        caption: Text prompt describing the desired sound.

    Returns:
        Path to a 16 kHz WAV file, or ``None`` on failure. The file is
        intentionally not deleted here — Gradio serves it to the client.
    """
    try:
        # Move the diffusion pipeline onto the GPU only for the duration of
        # the call, then back to CPU to free VRAM between requests.
        pipe.to("cuda")
        audio_output = pipe(
            prompt=caption,
            num_inference_steps=50,
            guidance_scale=7.5,
        )
        pipe.to("cpu")

        audio = audio_output.audios[0]

        # Create the file, close the handle, then let scipy open the path
        # itself — writing to a still-open NamedTemporaryFile by name is
        # not portable (fails on Windows).
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
            wav_path = temp_wav.name
        write(wav_path, 16000, audio)  # AudioLDM2 outputs 16 kHz audio
        return wav_path
    except Exception as e:
        print(f"Error generating audio from caption: {e}")
        return None


with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
    with gr.Row():
        with gr.Column(scale=1):
            gr.Image(
                value="https://via.placeholder.com/150",
                interactive=False,
                label="App Logo",
                elem_id="app-logo",
            )
        with gr.Column(scale=5):
            gr.HTML("""
🎶 Image-to-Sound Generator
Transform your images into descriptive captions and immersive soundscapes.
""")

    with gr.Row():
        with gr.Column():
            gr.Markdown("""
### How It Works
1. **Upload an Image**: Select an image to analyze.
2. **Generate Description**: Get a detailed caption describing your image.
3. **Generate Sound**: Create an audio representation based on the caption.
""")

    with gr.Row():
        with gr.Column(scale=1):
            image_upload = gr.File(label="Upload Image", type="binary")
            generate_description_button = gr.Button(
                "Generate Description", variant="primary"
            )
        with gr.Column(scale=2):
            caption_display = gr.Textbox(
                label="Generated Caption",
                interactive=False,
                placeholder="Your image caption will appear here.",
            )
            generate_sound_button = gr.Button("Generate Sound", variant="primary")
        with gr.Column(scale=1):
            audio_output = gr.Audio(label="Generated Sound Effect", interactive=False)

    with gr.Row():
        gr.Markdown("""
## About This App
This application uses advanced machine learning models to transform images into text captions and generate matching sound effects. It's a unique blend of visual and auditory creativity, powered by state-of-the-art AI technology.

For inquiries, contact us at [contact@bilsimaging.com](mailto:contact@bilsimaging.com).
""")

    def update_caption(image_file):
        """Adapter for the UI: return only the caption/error text."""
        description, _ = analyze_image_with_free_model(image_file)
        return description

    def generate_sound(description):
        """Adapter for the UI: skip synthesis when captioning failed."""
        # Guard both failure messages produced by the captioner; the
        # original only checked the "Error..." prefix and would have fed
        # "No caption was generated." to the audio model as a prompt.
        if (
            not description
            or description.startswith("Error")
            or description == "No caption was generated."
        ):
            return None
        return get_audioldm_from_caption(description)

    generate_description_button.click(
        fn=update_caption,
        inputs=image_upload,
        outputs=caption_display,
    )
    generate_sound_button.click(
        fn=generate_sound,
        inputs=caption_display,
        outputs=audio_output,
    )

demo.launch(debug=True, share=True)