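"""Image-to-Sound Generator (Hugging Face Space).

Captions an uploaded image with ViT-GPT2, then synthesizes a matching
soundscape with the AudioLDM2 diffusion pipeline.
"""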
import spaces
import os
import tempfile
import gradio as gr
from dotenv import load_dotenv
import torch
from scipy.io.wavfile import write
from diffusers import DiffusionPipeline
from transformers import pipeline
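
# Read the Hugging Face token (HF_TKN) from a local .env file, if present.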
load_dotenv()
hf_token = os.getenv("HF_TKN")
device_id = 0 if torch.cuda.is_available() else -1
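
# Image-captioning pipeline; device_id selects GPU 0 when CUDA is available.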
captioning_pipeline = pipeline(
    "image-to-text",
    model="nlpconnect/vit-gpt2-image-captioning",
    device=device_id
)
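
# AudioLDM2 text-to-audio pipeline; kept on CPU until a request arrives.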
pipe = DiffusionPipeline.from_pretrained(
    "cvssp/audioldm2",
    token=hf_token  # `use_auth_token` is deprecated in favor of `token`
)
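
# @spaces.GPU requests a (Zero)GPU slot for up to 120 s per call.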
@spaces.GPU(duration=120)
def analyze_image_with_free_model(image_file):
    """Caption the uploaded image bytes. Returns (caption, error_flag)."""
    try:
        # Persist the uploaded bytes so the pipeline can read them from disk.
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
            temp_file.write(image_file)
            temp_image_path = temp_file.name

        results = captioning_pipeline(temp_image_path)
        os.remove(temp_image_path)  # clean up the temporary image
        if not results or not isinstance(results, list):
            return "Error: Could not generate caption.", True

        caption = results[0].get("generated_text", "").strip()
        if not caption:
            return "No caption was generated.", True
        return caption, False
    except Exception as e:
        return f"Error analyzing image: {e}", True
@spaces.GPU(duration=120)
def get_audioldm_from_caption(caption):
    """Generate a sound effect from a text prompt. Returns a WAV path or None."""
    try:
        # Fall back to CPU so the function still runs without a GPU.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        pipe.to(device)
        audio_output = pipe(
            prompt=caption,
            num_inference_steps=50,
            guidance_scale=7.5
        )
        pipe.to("cpu")  # release GPU memory between requests
        audio = audio_output.audios[0]

        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
            write(temp_wav.name, 16000, audio)  # AudioLDM2 generates 16 kHz audio
            return temp_wav.name
    except Exception as e:
        print(f"Error generating audio from caption: {e}")
        return None
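
# Build the Gradio interface.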
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
    with gr.Row():
        with gr.Column(scale=1):
            gr.Image(value="https://via.placeholder.com/150", interactive=False, label="App Logo", elem_id="app-logo")
        with gr.Column(scale=5):
            gr.HTML("""
                <div style="text-align: center; font-size: 32px; font-weight: bold; margin-bottom: 10px;">🎶 Image-to-Sound Generator</div>
                <div style="text-align: center; font-size: 16px; color: #6c757d;">Transform your images into descriptive captions and immersive soundscapes.</div>
            """)

    with gr.Row():
        with gr.Column():
            gr.Markdown("""
                ### How It Works
                1. **Upload an Image**: Select an image to analyze.
                2. **Generate Description**: Get a detailed caption describing your image.
                3. **Generate Sound**: Create an audio representation based on the caption.
            """)

    with gr.Row():
        with gr.Column(scale=1):
            image_upload = gr.File(label="Upload Image", type="binary")
            generate_description_button = gr.Button("Generate Description", variant="primary")
        with gr.Column(scale=2):
            caption_display = gr.Textbox(label="Generated Caption", interactive=False, placeholder="Your image caption will appear here.")
            generate_sound_button = gr.Button("Generate Sound", variant="primary")
        with gr.Column(scale=1):
            audio_output = gr.Audio(label="Generated Sound Effect", interactive=False)
    with gr.Row():
        gr.Markdown("""
            ## About This App
            This application pairs a ViT-GPT2 image-captioning model with the AudioLDM2 text-to-audio model to turn images into descriptive captions and matching sound effects.

            For inquiries, contact us at [contact@bilsimaging.com](mailto:contact@bilsimaging.com).
        """)
    def update_caption(image_file):
        description, _ = analyze_image_with_free_model(image_file)
        return description

    def generate_sound(description):
        # Skip generation when captioning failed or produced nothing.
        if not description or description.startswith("Error"):
            return None
        return get_audioldm_from_caption(description)
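
    # Wire the buttons to their callbacks.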
    generate_description_button.click(
        fn=update_caption,
        inputs=image_upload,
        outputs=caption_display
    )

    generate_sound_button.click(
        fn=generate_sound,
        inputs=caption_display,
        outputs=audio_output
    )

# share=True is unsupported on Hugging Face Spaces, so it is omitted here.
demo.launch(debug=True)