import os
import tempfile
from pathlib import Path

import gradio as gr
import spaces
import torch
from diffusers import DiffusionPipeline
from dotenv import load_dotenv
from scipy.io.wavfile import write
from transformers import pipeline

load_dotenv()
hf_token = os.getenv("HF_TKN")

# Run the captioning pipeline on the first CUDA device if one is available,
# otherwise fall back to CPU (-1).
device_id = 0 if torch.cuda.is_available() else -1

captioning_pipeline = pipeline(
    "image-to-text",
    model="nlpconnect/vit-gpt2-image-captioning",
    device=device_id,
)

pipe = DiffusionPipeline.from_pretrained(
    "cvssp/audioldm2",
    use_auth_token=hf_token,
)


@spaces.GPU(duration=120)
def analyze_image_with_free_model(image_file):
    """Caption an uploaded image. Returns (caption, error_flag)."""
    try:
        # Persist the raw upload so the captioning pipeline can read it from disk.
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
            temp_file.write(image_file)
            temp_image_path = temp_file.name

        results = captioning_pipeline(temp_image_path)
        if not results or not isinstance(results, list):
            return "Error: Could not generate caption.", True

        caption = results[0].get("generated_text", "").strip()
        if not caption:
            return "No caption was generated.", True
        return caption, False

    except Exception as e:
        return f"Error analyzing image: {e}", True


@spaces.GPU(duration=120)
def get_audioldm_from_caption(caption):
    """Generate a WAV file from a text caption with AudioLDM 2.

    Returns the path to the temporary WAV file, or None on failure.
    """
    try:
        # Move the diffusion pipeline to the GPU only for the duration of
        # inference, then back to CPU to free GPU memory.
        pipe.to("cuda")
        audio_output = pipe(
            prompt=caption,
            num_inference_steps=50,
            guidance_scale=7.5,
        )
        pipe.to("cpu")

        audio = audio_output.audios[0]
        # AudioLDM 2 produces audio at a 16 kHz sample rate.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
            write(temp_wav.name, 16000, audio)
            return temp_wav.name

    except Exception as e:
        print(f"Error generating audio from caption: {e}")
        return None


with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
    with gr.Row():
        with gr.Column(scale=1):
            gr.Image(
                value="https://via.placeholder.com/150",
                interactive=False,
                label="App Logo",
                elem_id="app-logo",
            )
        with gr.Column(scale=5):
            gr.HTML("""