Bils committed on
Commit
a6e39ab
·
verified ·
1 Parent(s): 9f4cca0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -21
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import os
2
  import tempfile
3
  import gradio as gr
@@ -9,32 +10,32 @@ from transformers import pipeline
9
  from pathlib import Path
10
 
11
  load_dotenv()
12
-
13
  hf_token = os.getenv("HF_TKN")
14
 
 
15
  captioning_pipeline = pipeline(
16
- "image-to-text",
17
- model="nlpconnect/vit-gpt2-image-captioning",
18
  )
19
 
20
- device = "cuda" if torch.cuda.is_available() else "cpu"
21
  pipe = DiffusionPipeline.from_pretrained(
22
- "cvssp/audioldm2",
23
- use_auth_token=hf_token
24
  )
25
- pipe = pipe.to(device)
26
 
 
27
  def analyze_image_with_free_model(image_file):
28
- """
29
- Analyzes an uploaded image using a free Hugging Face model for image captioning.
30
- Returns: (caption_text, is_error_flag)
31
- """
32
  try:
 
 
33
  with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
34
  temp_file.write(image_file)
35
  temp_image_path = temp_file.name
36
 
37
  results = captioning_pipeline(temp_image_path)
 
 
 
38
  if not results or not isinstance(results, list):
39
  return "Error: Could not generate caption.", True
40
 
@@ -44,20 +45,19 @@ def analyze_image_with_free_model(image_file):
44
  return caption, False
45
 
46
  except Exception as e:
47
- print(f"Error analyzing image: {e}")
48
  return f"Error analyzing image: {e}", True
49
 
 
50
  def get_audioldm_from_caption(caption):
51
- """
52
- Generates sound from a caption using the AudioLDM-2 model.
53
- Returns the filename (path) of the generated .wav file.
54
- """
55
  try:
 
 
56
  audio_output = pipe(
57
  prompt=caption,
58
  num_inference_steps=50,
59
  guidance_scale=7.5
60
  )
 
61
  audio = audio_output.audios[0]
62
 
63
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
@@ -82,7 +82,7 @@ with gr.Blocks(css=css) as demo:
82
  🎶 Generate Sound Effects from Image
83
  </h1>
84
  <p style="text-align: center;">
85
- Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
86
  </p>
87
  """)
88
 
@@ -101,11 +101,10 @@ with gr.Blocks(css=css) as demo:
101
 
102
  image_upload = gr.File(label="Upload Image", type="binary")
103
  generate_description_button = gr.Button("Generate Description")
104
- caption_display = gr.Textbox(label="Image Description", interactive=False) # Keep read-only
105
  generate_sound_button = gr.Button("Generate Sound Effect")
106
  audio_output = gr.Audio(label="Generated Sound Effect")
107
 
108
- # Extra footer
109
  gr.Markdown("""
110
  ## 👥 How You Can Contribute
111
  We welcome contributions and suggestions for improvements. Your feedback is invaluable
@@ -125,12 +124,12 @@ with gr.Blocks(css=css) as demo:
125
  """)
126
 
127
  def update_caption(image_file):
128
- description, error_flag = analyze_image_with_free_model(image_file)
129
  return description
130
 
131
  def generate_sound(description):
132
  if not description or description.startswith("Error"):
133
- return None
134
  audio_path = get_audioldm_from_caption(description)
135
  return audio_path
136
 
 
1
+ import spaces
2
  import os
3
  import tempfile
4
  import gradio as gr
 
10
  from pathlib import Path
11
 
12
  load_dotenv()
 
13
  hf_token = os.getenv("HF_TKN")
14
 
15
+ # Initialize pipelines globally (in CPU mode)
16
  captioning_pipeline = pipeline(
17
+ "image-to-text",
18
+ model="nlpconnect/vit-gpt2-image-captioning"
19
  )
20
 
 
21
  pipe = DiffusionPipeline.from_pretrained(
22
+ "cvssp/audioldm2",
23
+ use_auth_token=hf_token
24
  )
 
25
 
26
+ @spaces.GPU(duration=120)
27
  def analyze_image_with_free_model(image_file):
 
 
 
 
28
  try:
29
+ # Move captioning pipeline to GPU
30
+ captioning_pipeline.to("cuda")
31
  with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as temp_file:
32
  temp_file.write(image_file)
33
  temp_image_path = temp_file.name
34
 
35
  results = captioning_pipeline(temp_image_path)
36
+ # Move back to CPU (optional)
37
+ captioning_pipeline.to("cpu")
38
+
39
  if not results or not isinstance(results, list):
40
  return "Error: Could not generate caption.", True
41
 
 
45
  return caption, False
46
 
47
  except Exception as e:
 
48
  return f"Error analyzing image: {e}", True
49
 
50
+ @spaces.GPU(duration=120)
51
  def get_audioldm_from_caption(caption):
 
 
 
 
52
  try:
53
+ # Move AudioLDM pipeline to GPU
54
+ pipe.to("cuda")
55
  audio_output = pipe(
56
  prompt=caption,
57
  num_inference_steps=50,
58
  guidance_scale=7.5
59
  )
60
+ pipe.to("cpu")
61
  audio = audio_output.audios[0]
62
 
63
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
 
82
  🎶 Generate Sound Effects from Image
83
  </h1>
84
  <p style="text-align: center;">
85
+ Powered by <a href="https://bilsimaging.com" target="_blank">Bilsimaging</a>
86
  </p>
87
  """)
88
 
 
101
 
102
  image_upload = gr.File(label="Upload Image", type="binary")
103
  generate_description_button = gr.Button("Generate Description")
104
+ caption_display = gr.Textbox(label="Image Description", interactive=False)
105
  generate_sound_button = gr.Button("Generate Sound Effect")
106
  audio_output = gr.Audio(label="Generated Sound Effect")
107
 
 
108
  gr.Markdown("""
109
  ## 👥 How You Can Contribute
110
  We welcome contributions and suggestions for improvements. Your feedback is invaluable
 
124
  """)
125
 
126
def update_caption(image_file):
    """Caption the uploaded image and return only the text (error flag dropped)."""
    caption_text, _error_flag = analyze_image_with_free_model(image_file)
    return caption_text
129
 
130
def generate_sound(description):
    """Turn a caption into a sound effect path; None for empty or error captions."""
    # Guard: only run generation when we have a real (non-error) description.
    if description and not description.startswith("Error"):
        return get_audioldm_from_caption(description)
    return None
135