Spaces:

ANASAKHTAR
/

Image_Captions_With_Audio

Running

App Files Files Community

Muhammad Anas Akhtar committed on Dec 8, 2024

Commit

4d3aad6

verified ·

1 Parent(s): e56a316

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -82

app.py CHANGED Viewed

@@ -1,97 +1,51 @@
 import torch
 import gradio as gr
 from PIL import Image
-import numpy as np
-import os
-from transformers import pipeline, AutoProcessor, AutoModelForCausalLM
 import scipy.io.wavfile as wavfile
-# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
 device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"Using device: {device}")
-# Build the image-to-text pipeline around the BLIP large captioning
-# checkpoint, placed on the device selected above.
 caption_image = pipeline("image-to-text",
-                       model="Salesforce/blip-image-captioning-large",
-                       device=device)
-# Initialize TTS with Coqui TTS
-try:
-    from TTS.api import TTS
-    tts = TTS("tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
-except ImportError:
-    print("Installing TTS...")
-    import subprocess
-    subprocess.check_call(["pip", "install", "TTS"])
-    from TTS.api import TTS
-    tts = TTS("tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
-def ensure_output_dir():
-    """Ensure the output directory exists"""
-    output_dir = os.path.join(os.path.expanduser("~"), "AudioCaptions")
-    os.makedirs(output_dir, exist_ok=True)
-    return output_dir
 def generate_audio(text):
-    """
-    Generate audio from text and save it
-    """
-    try:
-        # Create output directory and file path
-        output_dir = ensure_output_dir()
-        output_path = os.path.join(output_dir, "caption_audio.wav")
-        # Generate speech using Coqui TTS
-        tts.tts_to_file(text=text, file_path=output_path)
-        return output_path
-    except Exception as e:
-        print(f"Error generating audio: {str(e)}")
-        raise gr.Error(f"Failed to generate audio: {str(e)}")
-def caption_my_image(image):
-    """
-    Generate caption for image and convert it to speech
-    """
-    try:
-        if image is None:
-            raise gr.Error("Please upload an image")
-        # Generate caption
-        captions = caption_image(images=image)
-        if not captions or len(captions) == 0:
-            raise gr.Error("Could not generate caption for this image")
-        caption_text = captions[0]['generated_text']
-        print(f"Generated caption: {caption_text}")
-        # Generate audio from caption
-        audio_path = generate_audio(caption_text)
-        return [audio_path, caption_text]
-    except Exception as e:
-        print(f"Error in caption_my_image: {str(e)}")
-        raise gr.Error(f"Failed to process image: {str(e)}")
-# Create the Gradio interface
-demo = gr.Interface(
-    fn=caption_my_image,
-    inputs=[
-        gr.Image(label="Upload Image", type="pil")
-    ],
-    outputs=[
-        gr.Audio(label="Generated Audio"),
-        gr.Textbox(label="Generated Caption")
-    ],
-    title="Image Captioning with Audio",
-    description="""
-    Upload an image and the application will:
-    1. Generate a descriptive caption for the image
-    2. Convert the caption to speech
-    """,
-    examples=[],
-    cache_examples=False
-)
-if __name__ == "__main__":
-    demo.launch()

 import torch
 import gradio as gr
 from PIL import Image
 import scipy.io.wavfile as wavfile
+# Use a pipeline as a high-level helper
+from transformers import pipeline
 device = "cuda" if torch.cuda.is_available() else "cpu"
+# model_path = ("../Models/models--Salesforce--blip-image-captioning-large"
+#               "/snapshots/2227ac38c9f16105cb0412e7cab4759978a8fd90")
+#
+# tts_model_path = ("../Models/models--kakao-enterprise--vits-ljs/snapshots"
+#                   "/3bcb8321394f671bd948ebf0d086d694dda95464")
 caption_image = pipeline("image-to-text",
+                model="Salesforce/blip-image-captioning-large", device=device)
+narrator = pipeline("text-to-speech",
+                    model="kakao-enterprise/vits-ljs")
+# caption_image = pipeline("image-to-text",
+#                 model=model_path, device=device)
+#
+# narrator = pipeline("text-to-speech",
+#                     model=tts_model_path)
 def generate_audio(text):
+    # Generate the narrated text
+    narrated_text = narrator(text)
+    # Save the audio to a WAV file
+    wavfile.write("output.wav", rate=narrated_text["sampling_rate"],
+                  data=narrated_text["audio"][0])
+    # Return the path to the saved audio file
+    return "output.wav"
+def caption_my_image(pil_image):
+    semantics = caption_image(images=pil_image)[0]['generated_text']
+    return generate_audio(semantics)
+demo = gr.Interface(fn=caption_my_image,
+                    inputs=[gr.Image(label="Select Image",type="pil")],
+                    outputs=[gr.Audio(label="Image Caption")],
+                    title="@GenAILearniverse Project 8: Image Captioning",
+                    description="THIS APPLICATION WILL BE USED TO CAPTION THE IMAGE.")
+demo.launch()