Muhammad Anas Akhtar
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -3,23 +3,28 @@ import gradio as gr
|
|
3 |
from PIL import Image
|
4 |
import numpy as np
|
5 |
import os
|
6 |
-
|
7 |
-
|
8 |
-
from transformers import pipeline
|
9 |
|
10 |
# Set device
|
11 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
12 |
print(f"Using device: {device}")
|
13 |
|
14 |
-
# Initialize the
|
15 |
caption_image = pipeline("image-to-text",
|
16 |
model="Salesforce/blip-image-captioning-large",
|
17 |
device=device)
|
18 |
|
19 |
-
#
|
20 |
-
|
21 |
-
|
22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
def ensure_output_dir():
|
25 |
"""Ensure the output directory exists"""
|
@@ -32,16 +37,12 @@ def generate_audio(text):
|
|
32 |
Generate audio from text and save it
|
33 |
"""
|
34 |
try:
|
35 |
-
# Generate the speech
|
36 |
-
speech = narrator(text)
|
37 |
-
|
38 |
# Create output directory and file path
|
39 |
output_dir = ensure_output_dir()
|
40 |
output_path = os.path.join(output_dir, "caption_audio.wav")
|
41 |
|
42 |
-
#
|
43 |
-
|
44 |
-
f.write(speech["audio"])
|
45 |
|
46 |
return output_path
|
47 |
except Exception as e:
|
@@ -82,7 +83,7 @@ demo = gr.Interface(
|
|
82 |
gr.Audio(label="Generated Audio"),
|
83 |
gr.Textbox(label="Generated Caption")
|
84 |
],
|
85 |
-
title="Image Captioning with Audio",
|
86 |
description="""
|
87 |
Upload an image and the application will:
|
88 |
1. Generate a descriptive caption for the image
|
@@ -93,4 +94,4 @@ demo = gr.Interface(
|
|
93 |
)
|
94 |
|
95 |
if __name__ == "__main__":
|
96 |
-
demo.launch()
|
|
|
from PIL import Image
import numpy as np
import os
from transformers import pipeline, AutoProcessor, AutoModelForCausalLM
import scipy.io.wavfile as wavfile

# Set device for model inference.
# NOTE(review): `torch` is used below but not imported in this hunk — it is
# presumably imported on line 1-2 of the file (outside this view); confirm.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Initialize the image captioning pipeline (BLIP large checkpoint).
caption_image = pipeline("image-to-text",
                         model="Salesforce/blip-image-captioning-large",
                         device=device)

# Initialize text-to-speech with Coqui TTS; install it on the fly if absent.
try:
    from TTS.api import TTS
    tts = TTS("tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
except ImportError:
    print("Installing TTS...")
    import subprocess
    import sys
    # Invoke pip through the current interpreter (`python -m pip`) so the
    # package is installed into the *active* environment — a bare "pip" on
    # PATH may belong to a different Python installation.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "TTS"])
    from TTS.api import TTS
    tts = TTS("tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
|
28 |
|
29 |
def ensure_output_dir():
|
30 |
"""Ensure the output directory exists"""
|
|
|
37 |
Generate audio from text and save it
|
38 |
"""
|
39 |
try:
|
|
|
|
|
|
|
40 |
# Create output directory and file path
|
41 |
output_dir = ensure_output_dir()
|
42 |
output_path = os.path.join(output_dir, "caption_audio.wav")
|
43 |
|
44 |
+
# Generate speech using Coqui TTS
|
45 |
+
tts.tts_to_file(text=text, file_path=output_path)
|
|
|
46 |
|
47 |
return output_path
|
48 |
except Exception as e:
|
|
|
83 |
gr.Audio(label="Generated Audio"),
|
84 |
gr.Textbox(label="Generated Caption")
|
85 |
],
|
86 |
+
title="@GenAILearniverse Project: Image Captioning with Audio",
|
87 |
description="""
|
88 |
Upload an image and the application will:
|
89 |
1. Generate a descriptive caption for the image
|
|
|
94 |
)
|
95 |
|
96 |
if __name__ == "__main__":
|
97 |
+
demo.launch()
|