# Hugging Face Space — runs on ZeroGPU ("Running on Zero").
import os
import tempfile
from pathlib import Path

import gradio as gr
import spaces
import torch
from diffusers import DiffusionPipeline
from dotenv import load_dotenv
from scipy.io.wavfile import write
from transformers import pipeline
# --- One-time module setup: environment, device selection, model pipelines ---

# Read HF_TKN (and anything else) from a local .env file if present.
load_dotenv()
hf_token = os.getenv("HF_TKN")

# transformers pipelines take a device index: 0 = first GPU, -1 = CPU.
device_id = 0 if torch.cuda.is_available() else -1

# Image -> caption model (ViT encoder + GPT-2 decoder), loaded once and
# shared across requests.
captioning_pipeline = pipeline(
    "image-to-text",
    model="nlpconnect/vit-gpt2-image-captioning",
    device=device_id,
)

# Caption -> audio latent-diffusion model. Kept on CPU until inference
# (see get_audioldm_from_caption).
# NOTE(review): `use_auth_token` is deprecated in newer diffusers in favour
# of `token=` — kept for compatibility with the pinned version; confirm.
pipe = DiffusionPipeline.from_pretrained(
    "cvssp/audioldm2",
    use_auth_token=hf_token,
)
def analyze_image_with_free_model(image_file):
    """Generate a caption for an uploaded image.

    Args:
        image_file: Raw image bytes (from ``gr.File(type="binary")``).

    Returns:
        tuple[str, bool]: ``(message, is_error)`` — the caption and ``False``
        on success, or a human-readable error message and ``True`` on failure.
    """
    temp_image_path = None
    try:
        # The captioning pipeline expects a file path, so persist the bytes.
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
            temp_file.write(image_file)
            temp_image_path = temp_file.name

        results = captioning_pipeline(temp_image_path)
        if not results or not isinstance(results, list):
            return "Error: Could not generate caption.", True

        caption = results[0].get("generated_text", "").strip()
        if not caption:
            return "No caption was generated.", True
        return caption, False
    except Exception as e:
        return f"Error analyzing image: {e}", True
    finally:
        # Fix: the original leaked one temp file per request (delete=False,
        # never removed). Clean up on every path.
        if temp_image_path and os.path.exists(temp_image_path):
            os.remove(temp_image_path)
def get_audioldm_from_caption(caption):
    """Synthesize a sound effect from a text caption with AudioLDM2.

    Args:
        caption: Text prompt describing the desired sound.

    Returns:
        str | None: Path to a generated 16 kHz WAV file, or ``None`` on error.
    """
    try:
        # Fix: the original called pipe.to("cuda") unconditionally, crashing
        # on CPU-only hosts even though the rest of the file guards on
        # torch.cuda.is_available().
        device = "cuda" if torch.cuda.is_available() else "cpu"
        try:
            pipe.to(device)
            audio_output = pipe(
                prompt=caption,
                num_inference_steps=50,
                guidance_scale=7.5,
            )
        finally:
            # Fix: always park the model back on CPU, even if generation
            # raises (the original left it on CUDA on error).
            pipe.to("cpu")

        audio = audio_output.audios[0]
        # AudioLDM2 emits 16 kHz audio; write it out for the gr.Audio widget.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
            write(temp_wav.name, 16000, audio)
            return temp_wav.name
    except Exception as e:
        print(f"Error generating audio from caption: {e}")
        return None
# ---------------------------------------------------------------------------
# Gradio UI: logo/header row, how-it-works blurb, the upload -> caption ->
# sound pipeline, and an about/contact footer.
# ---------------------------------------------------------------------------
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
    # Header: small logo next to the title/subtitle banner.
    with gr.Row():
        with gr.Column(scale=1):
            gr.Image(
                value="https://via.placeholder.com/150",
                interactive=False,
                label="App Logo",
                elem_id="app-logo",
            )
        with gr.Column(scale=5):
            gr.HTML("""
            <div style="text-align: center; font-size: 32px; font-weight: bold; margin-bottom: 10px;">🎶 Image-to-Sound Generator</div>
            <div style="text-align: center; font-size: 16px; color: #6c757d;">Transform your images into descriptive captions and immersive soundscapes.</div>
            """)

    # Short usage instructions.
    with gr.Row():
        with gr.Column():
            gr.Markdown("""
            ### How It Works
            1. **Upload an Image**: Select an image to analyze.
            2. **Generate Description**: Get a detailed caption describing your image.
            3. **Generate Sound**: Create an audio representation based on the caption.
            """)

    # Main pipeline: upload -> caption -> audio, left to right.
    with gr.Row():
        with gr.Column(scale=1):
            image_upload = gr.File(label="Upload Image", type="binary")
            generate_description_button = gr.Button(
                "Generate Description", variant="primary"
            )
        with gr.Column(scale=2):
            caption_display = gr.Textbox(
                label="Generated Caption",
                interactive=False,
                placeholder="Your image caption will appear here.",
            )
            generate_sound_button = gr.Button("Generate Sound", variant="primary")
        with gr.Column(scale=1):
            audio_output = gr.Audio(label="Generated Sound Effect", interactive=False)

    # Footer: about text and contact link.
    with gr.Row():
        gr.Markdown("""
        ## About This App
        This application uses advanced machine learning models to transform images into text captions and generate matching sound effects. It's a unique blend of visual and auditory creativity, powered by state-of-the-art AI technology.
        For inquiries, contact us at [contact@bilsimaging.com](mailto:contact@bilsimaging.com).
        """)

    def update_caption(image_file):
        """Run captioning and surface the message (caption or error text)."""
        description, _ = analyze_image_with_free_model(image_file)
        return description

    def generate_sound(description):
        """Turn a valid caption into audio; skip empty/error captions."""
        if not description or description.startswith("Error"):
            return None
        return get_audioldm_from_caption(description)

    generate_description_button.click(
        fn=update_caption,
        inputs=image_upload,
        outputs=caption_display,
    )
    generate_sound_button.click(
        fn=generate_sound,
        inputs=caption_display,
        outputs=audio_output,
    )

demo.launch(debug=True, share=True)