Muhammad Anas Akhtar committed
Commit 4d3aad6 · verified · 1 Parent(s): e56a316

Update app.py

Files changed (1):
  1. app.py +36 -82
app.py CHANGED
@@ -1,97 +1,51 @@
 import torch
 import gradio as gr
 from PIL import Image
-import numpy as np
-import os
-from transformers import pipeline, AutoProcessor, AutoModelForCausalLM
 import scipy.io.wavfile as wavfile
 
-# Set device
+# Use a pipeline as a high-level helper
+from transformers import pipeline
+
 device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"Using device: {device}")
 
-# Initialize the image captioning pipeline
+# model_path = ("../Models/models--Salesforce--blip-image-captioning-large"
+#               "/snapshots/2227ac38c9f16105cb0412e7cab4759978a8fd90")
+#
+# tts_model_path = ("../Models/models--kakao-enterprise--vits-ljs/snapshots"
+#                   "/3bcb8321394f671bd948ebf0d086d694dda95464")
+
 caption_image = pipeline("image-to-text",
-                         model="Salesforce/blip-image-captioning-large",
-                         device=device)
+                         model="Salesforce/blip-image-captioning-large", device=device)
 
-# Initialize TTS with Coqui TTS
-try:
-    from TTS.api import TTS
-    tts = TTS("tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
-except ImportError:
-    print("Installing TTS...")
-    import subprocess
-    subprocess.check_call(["pip", "install", "TTS"])
-    from TTS.api import TTS
-    tts = TTS("tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
+narrator = pipeline("text-to-speech",
+                    model="kakao-enterprise/vits-ljs")
 
-def ensure_output_dir():
-    """Ensure the output directory exists"""
-    output_dir = os.path.join(os.path.expanduser("~"), "AudioCaptions")
-    os.makedirs(output_dir, exist_ok=True)
-    return output_dir
+# caption_image = pipeline("image-to-text",
+#                          model=model_path, device=device)
+#
+# narrator = pipeline("text-to-speech",
+#                     model=tts_model_path)
 
 def generate_audio(text):
-    """
-    Generate audio from text and save it
-    """
-    try:
-        # Create output directory and file path
-        output_dir = ensure_output_dir()
-        output_path = os.path.join(output_dir, "caption_audio.wav")
-
-        # Generate speech using Coqui TTS
-        tts.tts_to_file(text=text, file_path=output_path)
-
-        return output_path
-    except Exception as e:
-        print(f"Error generating audio: {str(e)}")
-        raise gr.Error(f"Failed to generate audio: {str(e)}")
+    # Generate the narrated text
+    narrated_text = narrator(text)
+
+    # Save the audio to a WAV file
+    wavfile.write("output.wav", rate=narrated_text["sampling_rate"],
+                  data=narrated_text["audio"][0])
+    # Return the path to the saved audio file
+    return "output.wav"
+
+
+def caption_my_image(pil_image):
+    semantics = caption_image(images=pil_image)[0]['generated_text']
+    return generate_audio(semantics)
 
-def caption_my_image(image):
-    """
-    Generate caption for image and convert it to speech
-    """
-    try:
-        if image is None:
-            raise gr.Error("Please upload an image")
-
-        # Generate caption
-        captions = caption_image(images=image)
-        if not captions or len(captions) == 0:
-            raise gr.Error("Could not generate caption for this image")
-
-        caption_text = captions[0]['generated_text']
-        print(f"Generated caption: {caption_text}")
-
-        # Generate audio from caption
-        audio_path = generate_audio(caption_text)
-
-        return [audio_path, caption_text]
-    except Exception as e:
-        print(f"Error in caption_my_image: {str(e)}")
-        raise gr.Error(f"Failed to process image: {str(e)}")
-
-# Create the Gradio interface
-demo = gr.Interface(
-    fn=caption_my_image,
-    inputs=[
-        gr.Image(label="Upload Image", type="pil")
-    ],
-    outputs=[
-        gr.Audio(label="Generated Audio"),
-        gr.Textbox(label="Generated Caption")
-    ],
-    title="Image Captioning with Audio",
-    description="""
-    Upload an image and the application will:
-    1. Generate a descriptive caption for the image
-    2. Convert the caption to speech
-    """,
-    examples=[],
-    cache_examples=False
-)
-
-if __name__ == "__main__":
-    demo.launch()
+demo = gr.Interface(fn=caption_my_image,
+                    inputs=[gr.Image(label="Select Image",type="pil")],
+                    outputs=[gr.Audio(label="Image Caption")],
+                    title="@GenAILearniverse Project 8: Image Captioning",
+                    description="THIS APPLICATION WILL BE USED TO CAPTION THE IMAGE.")
+demo.launch()
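For reference, a minimal sketch of the new caption-to-speech path driven directly, without the Gradio UI. This is an illustration under assumptions, not part of the commit: the model names and the text-to-speech output keys ("audio", "sampling_rate") come from the updated app.py above, while the input path "example.jpg" and output path "caption.wav" are placeholders.

# Sketch: run the updated pipelines end to end outside Gradio.
# Assumes a local "example.jpg"; model names follow the diff above.
from PIL import Image
from transformers import pipeline
import scipy.io.wavfile as wavfile

caption_image = pipeline("image-to-text",
                         model="Salesforce/blip-image-captioning-large")
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")

image = Image.open("example.jpg")  # placeholder input image
caption = caption_image(images=image)[0]["generated_text"]
speech = narrator(caption)  # dict with "audio" and "sampling_rate"
# VITS returns a batch of waveforms, hence the [0] (as in app.py)
wavfile.write("caption.wav", rate=speech["sampling_rate"],
              data=speech["audio"][0])
print(caption)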