Tamerstito committed
Commit 34f8d61 · verified · 1 Parent(s): c97e116

Update app.py

Files changed (1)
  1. app.py +24 -13
app.py CHANGED
@@ -1,31 +1,37 @@
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
 import torchaudio
+import torchaudio.transforms as T
 import torch
 import os
 import gradio as gr
 from pydub import AudioSegment
-
-# Load Whisper model and processor
-model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
-processor = WhisperProcessor.from_pretrained("openai/whisper-small")
-
-# Get decoder prompts for translation to Spanish
-forced_decoder_ids = processor.get_decoder_prompt_ids(language="es", task="translate")
-
-# Function to process and translate audio
 import traceback
 
+# Lazy-loaded globals
+model = None
+processor = None
+forced_decoder_ids = None
+
 def translate_audio(filepath):
+    global model, processor, forced_decoder_ids
     try:
         print("Received filepath:", filepath)
 
         if filepath is None or not os.path.exists(filepath):
             return "No audio file received or file does not exist."
 
+        # Lazy load model and processor to reduce startup load time
+        if model is None:
+            print("Loading Whisper model...")
+            model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
+            processor = WhisperProcessor.from_pretrained("openai/whisper-small")
+            forced_decoder_ids = processor.get_decoder_prompt_ids(language="es", task="translate")
+            print("Model loaded.")
+
         audio = AudioSegment.from_file(filepath)
         print("Audio loaded. Duration (ms):", len(audio))
 
-        chunk_length_ms = 30 * 1000
+        chunk_length_ms = 30 * 1000  # 30 seconds
         chunks = [audio[i:i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
         print(f"Audio split into {len(chunks)} chunks.")
 
@@ -37,7 +43,13 @@ def translate_audio(filepath):
             print(f"Exported chunk {i} to {chunk_path}")
 
             waveform, sample_rate = torchaudio.load(chunk_path)
-            print(f"Loaded chunk {i} with sample rate {sample_rate}")
+
+            # Resample if needed
+            if sample_rate != 16000:
+                print(f"Resampling from {sample_rate} Hz to 16000 Hz")
+                resampler = T.Resample(orig_freq=sample_rate, new_freq=16000)
+                waveform = resampler(waveform)
+                sample_rate = 16000
 
             inputs = processor(waveform[0], sampling_rate=sample_rate, return_tensors="pt")
 
@@ -76,9 +88,8 @@ file_transcribe = gr.Interface(
     allow_flagging="never"
 )
 
-# Initialize Blocks properly
+# Create tabbed demo
 demo = gr.Blocks()
-
 with demo:
     gr.TabbedInterface(
         [mic_transcribe, file_transcribe],
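
A note on the lazy-load guard in the first hunk: it moves the whisper-small checkpoint download from Space startup to the first request, but the "if model is None:" check is not thread-safe, so two concurrent Gradio requests could each trigger a load. A minimal sketch of the same pattern behind a stdlib lock; the _load_lock and _ensure_model names are hypothetical, everything else mirrors the diff:

    import threading

    _load_lock = threading.Lock()  # hypothetical name; serializes first-time loading

    def _ensure_model():
        # Thread-safe equivalent of the lazy-load block inside translate_audio.
        global model, processor, forced_decoder_ids
        with _load_lock:
            if model is None:
                model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
                processor = WhisperProcessor.from_pretrained("openai/whisper-small")
                forced_decoder_ids = processor.get_decoder_prompt_ids(language="es", task="translate")

translate_audio would then call _ensure_model() instead of inlining the check.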
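
Both hunks skip over the chunk-export loop (old lines 31-36), but the surrounding context shows each chunk being written to a chunk_path and logged. A sketch of what that elided loop presumably looks like; AudioSegment.export is pydub's real API, while the filename scheme is a guess:

    for i, chunk in enumerate(chunks):
        chunk_path = f"chunk_{i}.wav"  # hypothetical naming; the real path is not in the diff
        chunk.export(chunk_path, format="wav")  # write the 30 s slice to disk for torchaudio
        print(f"Exported chunk {i} to {chunk_path}")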
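
The resampling added in the second hunk is required, not cosmetic: Whisper's feature extractor expects 16 kHz input, so passing through 44.1 kHz audio would produce wrong features. One small cost of the patch as written is that a fresh T.Resample module is built for every chunk; torchaudio's functional form avoids that:

    import torchaudio.functional as F

    # Same output as T.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
    # with default resampling parameters, without constructing a module per chunk.
    waveform = F.resample(waveform, orig_freq=sample_rate, new_freq=16000)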
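
The visible per-chunk code ends at inputs = processor(...); the decode step is elided. Given that forced_decoder_ids is built with task="translate", the hidden lines presumably follow the standard transformers pattern sketched below (generate and batch_decode are real APIs; their exact use here is an assumption). Note also that Whisper's translate task outputs English, and language="es" declares the source language, so the removed comment "translation to Spanish" had the direction backwards:

    with torch.no_grad():  # inference only; no gradients needed
        predicted_ids = model.generate(
            inputs.input_features,
            forced_decoder_ids=forced_decoder_ids,  # pins language=es (source), task=translate
        )
    chunk_text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]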
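
The last hunk only rewires gr.Blocks; mic_transcribe and file_transcribe themselves sit outside the diff apart from the allow_flagging="never" context line, which suggests an older Gradio 3.x API (gr.Audio's source argument became sources in Gradio 4, and allow_flagging is deprecated in newer releases). A plausible reconstruction of one interface under 3.x, with every detail beyond translate_audio and allow_flagging a guess:

    mic_transcribe = gr.Interface(
        fn=translate_audio,                                     # the function patched above
        inputs=gr.Audio(source="microphone", type="filepath"),  # filepath matches the os.path.exists check
        outputs=gr.Textbox(label="Translation"),                # hypothetical label
        allow_flagging="never",                                 # matches the context line
    )

The second argument to gr.TabbedInterface (the tab labels) is likewise cut off by the diff context.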