Spaces:

Bils
/

Generate-Sound-Effects-from-Image

Runtime error

App Files Community

Bils committed on Jan 10

Commit

a6e39ab

verified ·

1 Parent(s): 9f4cca0

Update app.py

Browse files

Files changed (1) hide show

app.py +20 -21

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import os
 import tempfile
 import gradio as gr
@@ -9,32 +10,32 @@ from transformers import pipeline
 from pathlib import Path
 load_dotenv()
 hf_token = os.getenv("HF_TKN")
 captioning_pipeline = pipeline(
-    "image-to-text",
-    model="nlpconnect/vit-gpt2-image-captioning",
 )
-device = "cuda" if torch.cuda.is_available() else "cpu"
 pipe = DiffusionPipeline.from_pretrained(
-    "cvssp/audioldm2",
-    use_auth_token=hf_token
 )
-pipe = pipe.to(device)
 def analyze_image_with_free_model(image_file):
-    """
-    Analyzes an uploaded image using a free Hugging Face model for image captioning.
-    Returns: (caption_text, is_error_flag)
-    """
     try:
         with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
             temp_file.write(image_file)
             temp_image_path = temp_file.name
         results = captioning_pipeline(temp_image_path)
         if not results or not isinstance(results, list):
             return "Error: Could not generate caption.", True
@@ -44,20 +45,19 @@ def analyze_image_with_free_model(image_file):
         return caption, False
     except Exception as e:
-        print(f"Error analyzing image: {e}")
         return f"Error analyzing image: {e}", True
 def get_audioldm_from_caption(caption):
-    """
-    Generates sound from a caption using the AudioLDM-2 model.
-    Returns the filename (path) of the generated .wav file.
-    """
     try:
         audio_output = pipe(
             prompt=caption,
             num_inference_steps=50,
             guidance_scale=7.5
         )
         audio = audio_output.audios[0]
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
@@ -82,7 +82,7 @@ with gr.Blocks(css=css) as demo:
            🎶 Generate Sound Effects from Image
         </h1>
          <p style="text-align: center;">
-          ⚡  Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
         </p>
         """)
@@ -101,11 +101,10 @@ with gr.Blocks(css=css) as demo:
     image_upload = gr.File(label="Upload Image", type="binary")
     generate_description_button = gr.Button("Generate Description")
-    caption_display = gr.Textbox(label="Image Description", interactive=False)  # Keep read-only
     generate_sound_button = gr.Button("Generate Sound Effect")
     audio_output = gr.Audio(label="Generated Sound Effect")
-    # Extra footer
     gr.Markdown("""
     ## 👥 How You Can Contribute
     We welcome contributions and suggestions for improvements. Your feedback is invaluable
@@ -125,12 +124,12 @@ with gr.Blocks(css=css) as demo:
     """)
     def update_caption(image_file):
-        description, error_flag = analyze_image_with_free_model(image_file)
         return description
     def generate_sound(description):
         if not description or description.startswith("Error"):
-            return None
         audio_path = get_audioldm_from_caption(description)
         return audio_path

+import spaces
 import os
 import tempfile
 import gradio as gr
 from pathlib import Path
 load_dotenv()
 hf_token = os.getenv("HF_TKN")
+# Initialize pipelines globally (in CPU mode)
 captioning_pipeline = pipeline(
+    "image-to-text",
+    model="nlpconnect/vit-gpt2-image-captioning"
 )
 pipe = DiffusionPipeline.from_pretrained(
+    "cvssp/audioldm2",
+    use_auth_token=hf_token
 )
+@spaces.GPU(duration=120)
 def analyze_image_with_free_model(image_file):
     try:
+        # Move captioning pipeline to GPU
+        captioning_pipeline.to("cuda")
         with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
             temp_file.write(image_file)
             temp_image_path = temp_file.name
         results = captioning_pipeline(temp_image_path)
+        # Move back to CPU (optional)
+        captioning_pipeline.to("cpu")
         if not results or not isinstance(results, list):
             return "Error: Could not generate caption.", True
         return caption, False
     except Exception as e:
         return f"Error analyzing image: {e}", True
+@spaces.GPU(duration=120)
 def get_audioldm_from_caption(caption):
     try:
+        # Move AudioLDM pipeline to GPU
+        pipe.to("cuda")
         audio_output = pipe(
             prompt=caption,
             num_inference_steps=50,
             guidance_scale=7.5
         )
+        pipe.to("cpu")
         audio = audio_output.audios[0]
         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
            🎶 Generate Sound Effects from Image
         </h1>
          <p style="text-align: center;">
+          ⚡ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
         </p>
         """)
     image_upload = gr.File(label="Upload Image", type="binary")
     generate_description_button = gr.Button("Generate Description")
+    caption_display = gr.Textbox(label="Image Description", interactive=False)
     generate_sound_button = gr.Button("Generate Sound Effect")
     audio_output = gr.Audio(label="Generated Sound Effect")
     gr.Markdown("""
     ## 👥 How You Can Contribute
     We welcome contributions and suggestions for improvements. Your feedback is invaluable
     """)
     def update_caption(image_file):
+        description, _ = analyze_image_with_free_model(image_file)
         return description
     def generate_sound(description):
         if not description or description.startswith("Error"):
+            return None
         audio_path = get_audioldm_from_caption(description)
         return audio_path