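"""Image-to-Sound Generator, a Hugging Face Space running on ZeroGPU.

Captions an uploaded image with nlpconnect/vit-gpt2-image-captioning, then
synthesizes a matching sound effect from that caption with the cvssp/audioldm2
diffusion pipeline. The UI is built with Gradio Blocks.
"""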
import spaces  # keep this import first; ZeroGPU expects it before torch

import os
import tempfile

import gradio as gr
import torch
from diffusers import DiffusionPipeline
from dotenv import load_dotenv
from scipy.io.wavfile import write
from transformers import pipeline

# Read the Hugging Face token from a local .env file (expects HF_TKN=<token>).
load_dotenv()
hf_token = os.getenv("HF_TKN")

# transformers pipelines take a device index: 0 selects the first GPU, -1 the CPU.
device_id = 0 if torch.cuda.is_available() else -1
# Image-captioning pipeline; runs on the GPU whenever one is visible at startup.
captioning_pipeline = pipeline(
    "image-to-text",
    model="nlpconnect/vit-gpt2-image-captioning",
    device=device_id,
)

# Text-to-audio diffusion pipeline; kept on the CPU until a request needs it.
pipe = DiffusionPipeline.from_pretrained(
    "cvssp/audioldm2",
    use_auth_token=hf_token,
)
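
# On ZeroGPU Spaces, @spaces.GPU attaches a GPU to each decorated call;
# duration=120 requests up to 120 seconds of GPU time per invocation.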
@spaces.GPU(duration=120)
def analyze_image_with_free_model(image_file):
    """Caption an uploaded image. Returns (message, error_flag)."""
    try:
        # gr.File(type="binary") delivers raw bytes, but the captioning
        # pipeline expects a file path, so stage the bytes in a temp file.
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
            temp_file.write(image_file)
            temp_image_path = temp_file.name

        results = captioning_pipeline(temp_image_path)
        if not results or not isinstance(results, list):
            return "Error: Could not generate caption.", True

        caption = results[0].get("generated_text", "").strip()
        if not caption:
            return "No caption was generated.", True
        return caption, False
    except Exception as e:
        return f"Error analyzing image: {e}", True
@spaces.GPU(duration=120)
def get_audioldm_from_caption(caption):
    """Render a caption as audio. Returns a WAV file path, or None on failure."""
    try:
        # Move the diffusion pipeline to the GPU only for this request,
        # then park it back on the CPU so the GPU can be released.
        pipe.to("cuda")
        audio_output = pipe(
            prompt=caption,
            num_inference_steps=50,
            guidance_scale=7.5,
        )
        pipe.to("cpu")

        # AudioLDM2 generates 16 kHz audio; write it out as a WAV file.
        audio = audio_output.audios[0]
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
            write(temp_wav.name, 16000, audio)
        return temp_wav.name
    except Exception as e:
        print(f"Error generating audio from caption: {e}")
        return None
css = """
#header-container {
text-align: center;
margin: 20px 0;
}
#header-title {
font-size: 36px;
font-weight: bold;
margin-bottom: 10px;
color: #333333;
}
#header-subtitle {
font-size: 18px;
margin-bottom: 20px;
color: #555555;
}
#main-container {
max-width: 900px;
margin: 0 auto;
padding: 20px;
border-radius: 12px;
background: linear-gradient(135deg, #ffffff, #f0f0f0);
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
}
button.primary-button {
background: linear-gradient(90deg, #007bff, #0056b3);
color: white;
border: none;
padding: 12px 24px;
border-radius: 8px;
font-size: 16px;
cursor: pointer;
font-weight: bold;
transition: transform 0.2s, box-shadow 0.2s;
}
button.primary-button:hover {
transform: translateY(-3px);
box-shadow: 0 8px 15px rgba(0, 0, 0, 0.2);
}
button.primary-button:active {
transform: translateY(0);
box-shadow: none;
}
#footer-container {
margin-top: 30px;
text-align: center;
color: #666666;
font-size: 14px;
}
"""
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="header-container"):
        gr.HTML("""
            <div id="header-title">🎶 Image-to-Sound Generator</div>
            <div id="header-subtitle">Transform your images into descriptive captions and immersive soundscapes.</div>
        """)

    # gr.Box is a Gradio 3.x layout element; Gradio 4+ replaced it with gr.Group.
    with gr.Box(elem_id="main-container"):
        gr.Markdown("""
        ### How It Works
        1. **Upload an Image**: Select an image to analyze.
        2. **Generate Description**: Get a detailed caption describing your image.
        3. **Generate Sound**: Create an audio representation based on the caption.
        """)

        image_upload = gr.File(label="Upload Image", type="binary")
        generate_description_button = gr.Button("Generate Description", elem_classes="primary-button")
        caption_display = gr.Textbox(
            label="Generated Caption",
            interactive=False,
            placeholder="Your image caption will appear here.",
        )
        generate_sound_button = gr.Button("Generate Sound", elem_classes="primary-button")
        audio_output = gr.Audio(label="Generated Sound Effect", interactive=False)

    with gr.Box(elem_id="footer-container"):
        gr.Markdown("""
        ## About This App
        This application captions images with a vision-language model and generates
        matching sound effects from those captions with an audio diffusion model.

        For inquiries, contact us at [[email protected]](mailto:[email protected]).
        """)
    # Wiring: image upload -> caption textbox -> audio player.
    def update_caption(image_file):
        description, _ = analyze_image_with_free_model(image_file)
        return description

    def generate_sound(description):
        # Skip synthesis when the caption is empty or looks like an error.
        # (Messages such as "No caption was generated." still slip through.)
        if not description or description.startswith("Error"):
            return None
        return get_audioldm_from_caption(description)

    generate_description_button.click(
        fn=update_caption,
        inputs=image_upload,
        outputs=caption_display,
    )
    generate_sound_button.click(
        fn=generate_sound,
        inputs=caption_display,
        outputs=audio_output,
    )
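
# Note: Gradio ignores share=True when the app runs on Hugging Face Spaces,
# so the flag only has an effect for local runs.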
demo.launch(debug=True, share=True)