Muhammad Anas Akhtar committed
Commit c1309e4 · verified · 1 Parent(s): 83ed3dd

Update app.py

Files changed (1)
  1. app.py +17 -16
app.py CHANGED
@@ -3,23 +3,28 @@ import gradio as gr
 from PIL import Image
 import numpy as np
 import os
-
-# Use a pipeline as a high-level helper
-from transformers import pipeline
+from transformers import pipeline, AutoProcessor, AutoModelForCausalLM
+import scipy.io.wavfile as wavfile
 
 # Set device
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
 
-# Initialize the pipelines
+# Initialize the image captioning pipeline
 caption_image = pipeline("image-to-text",
                          model="Salesforce/blip-image-captioning-large",
                          device=device)
 
-# Using a different TTS model that's more stable
-narrator = pipeline("text-to-speech",
-                    model="microsoft/speecht5_tts",
-                    device=device)
+# Initialize TTS with Coqui TTS
+try:
+    from TTS.api import TTS
+    tts = TTS("tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
+except ImportError:
+    print("Installing TTS...")
+    import subprocess
+    subprocess.check_call(["pip", "install", "TTS"])
+    from TTS.api import TTS
+    tts = TTS("tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
 
 def ensure_output_dir():
     """Ensure the output directory exists"""
@@ -32,16 +37,12 @@ def generate_audio(text):
     Generate audio from text and save it
     """
     try:
-        # Generate the speech
-        speech = narrator(text)
-
         # Create output directory and file path
         output_dir = ensure_output_dir()
         output_path = os.path.join(output_dir, "caption_audio.wav")
 
-        # Save the audio file
-        with open(output_path, "wb") as f:
-            f.write(speech["audio"])
+        # Generate speech using Coqui TTS
+        tts.tts_to_file(text=text, file_path=output_path)
 
         return output_path
     except Exception as e:
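
Review note: the removed save path was the real bug here. transformers' "text-to-speech" pipeline returns a dict holding a float numpy array and a sampling rate, not encoded WAV bytes, so the old f.write(speech["audio"]) produced an unplayable file, while tts.tts_to_file writes a valid WAV itself. For reference, a sketch of what saving the old speecht5 output would have required (likely what the now-unused scipy.io.wavfile import was meant for):

# Sketch, for reference only: correctly saving output from the removed
# speecht5 path. speecht5 typically also needs speaker embeddings passed
# via forward_params, which the old code did not supply.
import scipy.io.wavfile as wavfile

speech = narrator(text)  # old pipeline("text-to-speech", model="microsoft/speecht5_tts")
wavfile.write(output_path, rate=speech["sampling_rate"], data=speech["audio"])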
@@ -82,7 +83,7 @@ demo = gr.Interface(
         gr.Audio(label="Generated Audio"),
         gr.Textbox(label="Generated Caption")
     ],
-    title="Image Captioning with Audio",
+    title="@GenAILearniverse Project: Image Captioning with Audio",
     description="""
     Upload an image and the application will:
     1. Generate a descriptive caption for the image
@@ -93,4 +94,4 @@ demo = gr.Interface(
 )
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
 
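For completeness, a quick local smoke test of the committed flow; caption_image and generate_audio are the names from the diff, and the image path is hypothetical:

# Minimal smoke test, assuming app.py's globals are importable.
from PIL import Image

image = Image.open("example.jpg")  # hypothetical test image
caption = caption_image(image)[0]["generated_text"]  # image-to-text pipelines return [{"generated_text": ...}]
audio_path = generate_audio(caption)
print(f"Caption: {caption}\nAudio saved to: {audio_path}")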