Update app.py
app.py CHANGED
@@ -1,31 +1,37 @@
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
 import torchaudio
+import torchaudio.transforms as T
 import torch
 import os
 import gradio as gr
 from pydub import AudioSegment
-
-# Load Whisper model and processor
-model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
-processor = WhisperProcessor.from_pretrained("openai/whisper-small")
-
-# Get decoder prompts for translation to Spanish
-forced_decoder_ids = processor.get_decoder_prompt_ids(language="es", task="translate")
-
-# Function to process and translate audio
 import traceback
 
+# Lazy-loaded globals
+model = None
+processor = None
+forced_decoder_ids = None
+
 def translate_audio(filepath):
+    global model, processor, forced_decoder_ids
     try:
         print("Received filepath:", filepath)
 
         if filepath is None or not os.path.exists(filepath):
             return "No audio file received or file does not exist."
 
+        # Lazy load model and processor to reduce startup load time
+        if model is None:
+            print("Loading Whisper model...")
+            model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
+            processor = WhisperProcessor.from_pretrained("openai/whisper-small")
+            forced_decoder_ids = processor.get_decoder_prompt_ids(language="es", task="translate")
+            print("Model loaded.")
+
         audio = AudioSegment.from_file(filepath)
         print("Audio loaded. Duration (ms):", len(audio))
 
-        chunk_length_ms = 30 * 1000
+        chunk_length_ms = 30 * 1000 # 30 seconds
         chunks = [audio[i:i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
         print(f"Audio split into {len(chunks)} chunks.")
 
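Note: the lazy-loading change defers the heavy from_pretrained() calls from module import to the first request, so the Space can start up before the model weights are downloaded. A minimal sketch of the pattern in isolation; expensive_load() here is a hypothetical stand-in for the real constructor, not code from app.py:

model = None

def expensive_load():
    print("Loading model...")
    return object()  # placeholder for the real model object

def get_model():
    global model
    if model is None:            # first call pays the loading cost
        model = expensive_load()
    return model                 # later calls reuse the cached object

get_model()  # prints "Loading model..."
get_model()  # no output: already cached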
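The 30-second chunk length matches Whisper's fixed input window (its feature extractor pads or trims each example to 30 s), and pydub slices by milliseconds, so audio[i:i + chunk_length_ms] is a time window rather than a sample range. A standalone sketch of the same split, assuming a hypothetical local file speech.wav:

from pydub import AudioSegment

audio = AudioSegment.from_file("speech.wav")  # hypothetical input file
chunk_length_ms = 30 * 1000
chunks = [audio[i:i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
print(len(chunks))  # a 75 s file yields 3 chunks: 30 s, 30 s, 15 s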
@@ -37,7 +43,13 @@ def translate_audio(filepath):
             print(f"Exported chunk {i} to {chunk_path}")
 
             waveform, sample_rate = torchaudio.load(chunk_path)
-
+
+            # Resample if needed
+            if sample_rate != 16000:
+                print(f"Resampling from {sample_rate} Hz to 16000 Hz")
+                resampler = T.Resample(orig_freq=sample_rate, new_freq=16000)
+                waveform = resampler(waveform)
+                sample_rate = 16000
 
             inputs = processor(waveform[0], sampling_rate=sample_rate, return_tensors="pt")
 
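Note: the resampling step matters because Whisper's processor assumes 16 kHz input, while exported chunks typically keep the source rate (often 44.1 or 48 kHz). A self-contained sketch of the same conversion on synthetic audio, so it runs without any file:

import torch
import torchaudio.transforms as T

sample_rate = 44100
waveform = torch.randn(1, sample_rate)  # one second of mono noise, shape (channels, samples)

if sample_rate != 16000:
    resampler = T.Resample(orig_freq=sample_rate, new_freq=16000)
    waveform = resampler(waveform)
    sample_rate = 16000

print(waveform.shape)  # torch.Size([1, 16000])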
@@ -76,9 +88,8 @@ file_transcribe = gr.Interface(
     allow_flagging="never"
 )
 
-#
+# Create tabbed demo
 demo = gr.Blocks()
-
 with demo:
     gr.TabbedInterface(
         [mic_transcribe, file_transcribe],