Muhammad Anas Akhtar committed
Commit c1309e4 · verified · 1 Parent(s): 83ed3dd

Update app.py

Files changed (1)
  1. app.py +17 -16
app.py CHANGED
@@ -3,23 +3,28 @@ import gradio as gr
 from PIL import Image
 import numpy as np
 import os
-
-# Use a pipeline as a high-level helper
-from transformers import pipeline
+from transformers import pipeline, AutoProcessor, AutoModelForCausalLM
+import scipy.io.wavfile as wavfile
 
 # Set device
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
 
-# Initialize the pipelines
+# Initialize the image captioning pipeline
 caption_image = pipeline("image-to-text",
                          model="Salesforce/blip-image-captioning-large",
                          device=device)
 
-# Using a different TTS model that's more stable
-narrator = pipeline("text-to-speech",
-                    model="microsoft/speecht5_tts",
-                    device=device)
+# Initialize TTS with Coqui TTS
+try:
+    from TTS.api import TTS
+    tts = TTS("tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
+except ImportError:
+    print("Installing TTS...")
+    import subprocess
+    subprocess.check_call(["pip", "install", "TTS"])
+    from TTS.api import TTS
+    tts = TTS("tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
 
 def ensure_output_dir():
     """Ensure the output directory exists"""
@@ -32,16 +37,12 @@ def generate_audio(text):
     Generate audio from text and save it
     """
     try:
-        # Generate the speech
-        speech = narrator(text)
-
         # Create output directory and file path
         output_dir = ensure_output_dir()
         output_path = os.path.join(output_dir, "caption_audio.wav")
 
-        # Save the audio file
-        with open(output_path, "wb") as f:
-            f.write(speech["audio"])
+        # Generate speech using Coqui TTS
+        tts.tts_to_file(text=text, file_path=output_path)
 
         return output_path
     except Exception as e:
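
Review note: the removed save path was the real bug here. transformers' "text-to-speech" pipeline returns a dict holding a float numpy array and a sampling rate, not encoded WAV bytes, so the old f.write(speech["audio"]) produced an unplayable file, while tts.tts_to_file writes a valid WAV itself. For reference, a sketch of what saving the old speecht5 output would have required (likely what the now-unused scipy.io.wavfile import was meant for):

# Sketch, for reference only: correctly saving output from the removed
# speecht5 path. speecht5 typically also needs speaker embeddings passed
# via forward_params, which the old code did not supply.
import scipy.io.wavfile as wavfile

speech = narrator(text)  # old pipeline("text-to-speech", model="microsoft/speecht5_tts")
wavfile.write(output_path, rate=speech["sampling_rate"], data=speech["audio"])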
@@ -82,7 +83,7 @@ demo = gr.Interface(
         gr.Audio(label="Generated Audio"),
         gr.Textbox(label="Generated Caption")
     ],
-    title="Image Captioning with Audio",
+    title="@GenAILearniverse Project: Image Captioning with Audio",
     description="""
     Upload an image and the application will:
     1. Generate a descriptive caption for the image
@@ -93,4 +94,4 @@ demo = gr.Interface(
 )
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
 
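For completeness, a quick local smoke test of the committed flow; caption_image and generate_audio are the names from the diff, and the image path is hypothetical:

# Minimal smoke test, assuming app.py's globals are importable.
from PIL import Image

image = Image.open("example.jpg")  # hypothetical test image
caption = caption_image(image)[0]["generated_text"]  # image-to-text pipelines return [{"generated_text": ...}]
audio_path = generate_audio(caption)
print(f"Caption: {caption}\nAudio saved to: {audio_path}")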