Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,52 +1,63 @@
|
|
1 |
-
from transformers import
|
2 |
-
|
3 |
-
|
4 |
-
model="openai/whisper-small", # multilingual model
|
5 |
-
generate_kwargs={"task": "translate", "language": "es"}
|
6 |
-
)
|
7 |
-
|
8 |
-
|
9 |
import os
|
10 |
import gradio as gr
|
11 |
from pydub import AudioSegment
|
12 |
|
13 |
-
|
14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
if filepath is None:
|
16 |
gr.Warning("No audio found, please retry.")
|
17 |
return ""
|
18 |
|
19 |
-
# Load audio using pydub
|
20 |
audio = AudioSegment.from_file(filepath)
|
21 |
chunk_length_ms = 30 * 1000 # 30 seconds
|
22 |
chunks = [audio[i:i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
|
23 |
|
24 |
-
|
25 |
|
26 |
for i, chunk in enumerate(chunks):
|
27 |
-
# Export chunk to temporary wav file
|
28 |
chunk_path = f"chunk_{i}.wav"
|
29 |
chunk.export(chunk_path, format="wav")
|
30 |
|
31 |
-
#
|
32 |
-
|
33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
-
# Clean up (optional)
|
36 |
os.remove(chunk_path)
|
37 |
|
38 |
-
return
|
39 |
-
|
|
|
40 |
mic_transcribe = gr.Interface(
|
41 |
-
fn=
|
42 |
inputs=gr.Audio(sources="microphone",
|
43 |
type="filepath"),
|
44 |
-
outputs=gr.Textbox(label="
|
45 |
lines=3),
|
46 |
allow_flagging="never")
|
47 |
|
48 |
file_transcribe = gr.Interface(
|
49 |
-
fn=
|
50 |
inputs=gr.Audio(sources="upload",
|
51 |
type="filepath"),
|
52 |
outputs=gr.Textbox(label="Translation (English to Spanish)",
|
|
|
1 |
+
from transformers import WhisperProcessor, WhisperForConditionalGeneration
|
2 |
+
import torchaudio
|
3 |
+
import torch
|
|
|
|
|
|
|
|
|
|
|
4 |
import os
|
5 |
import gradio as gr
|
6 |
from pydub import AudioSegment
|
7 |
|
8 |
+
# Load the Whisper checkpoint once at import time so every request reuses the
# same weights. The checkpoint name is kept in one place so the model and its
# processor cannot drift apart.
_WHISPER_CHECKPOINT = "openai/whisper-small"
model = WhisperForConditionalGeneration.from_pretrained(_WHISPER_CHECKPOINT)
processor = WhisperProcessor.from_pretrained(_WHISPER_CHECKPOINT)

# Decoder prompt ids that pin the language token and the task token.
# NOTE: in Whisper, `language` names the language SPOKEN in the audio and the
# "translate" task always produces ENGLISH text. As configured, this prompt
# therefore means "Spanish speech -> English text"; it does not translate
# into Spanish.
forced_decoder_ids = processor.get_decoder_prompt_ids(language="es", task="translate")
|
14 |
+
|
15 |
+
def translate_audio(filepath):
    """Run Whisper over an audio file in 30-second chunks and return the text.

    The audio is decoded with pydub, converted to 16 kHz mono, split into
    windows of at most 30 seconds, and each window is fed through the
    module-level ``processor`` / ``model`` pair. The module-level
    ``forced_decoder_ids`` selects the language and task prompt.

    Note: Whisper's ``translate`` task always emits English text; the
    ``language`` prompt names the language spoken in the audio.

    Args:
        filepath: Path to an audio file readable by pydub, or ``None`` when
            no audio was supplied.

    Returns:
        The decoded text of all chunks joined by single spaces, or ``""``
        when no audio was supplied or nothing was decoded.
    """
    import tempfile

    if filepath is None:
        gr.Warning("No audio found, please retry.")
        return ""

    target_sample_rate = 16000  # the only rate Whisper's feature extractor accepts
    chunk_length_ms = 30 * 1000  # 30 seconds

    # Resample and downmix up front: the feature extractor raises ValueError
    # for any sampling rate other than 16 kHz, and taking only channel 0 of a
    # stereo file would silently drop the other channel.
    audio = AudioSegment.from_file(filepath)
    audio = audio.set_frame_rate(target_sample_rate).set_channels(1)

    pieces = []
    # A private temporary directory keeps concurrent requests from overwriting
    # each other's chunk files and guarantees cleanup even if inference raises.
    with tempfile.TemporaryDirectory() as workdir:
        for start_ms in range(0, len(audio), chunk_length_ms):
            chunk = audio[start_ms:start_ms + chunk_length_ms]
            chunk_path = os.path.join(workdir, f"chunk_{start_ms}.wav")
            # export() returns the open output file; close it so the data is
            # flushed before torchaudio reads it back.
            chunk.export(chunk_path, format="wav").close()

            waveform, sample_rate = torchaudio.load(chunk_path)
            inputs = processor(
                waveform[0].numpy(),
                sampling_rate=sample_rate,
                return_tensors="pt",
            )

            # Inference only: skip gradient tracking.
            with torch.no_grad():
                generated_ids = model.generate(
                    inputs["input_features"],
                    forced_decoder_ids=forced_decoder_ids,
                )

            text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
            if text:
                pieces.append(text)

    return " ".join(pieces)
|
49 |
+
|
50 |
+
# Gradio UI components
# Microphone tab: the recording is handed to translate_audio as a file path
# and the returned text is shown in a three-line textbox.
# NOTE(review): the label promises English -> Spanish, but Whisper's
# "translate" task emits English text - confirm which direction is intended.
mic_transcribe = gr.Interface(
    fn=translate_audio,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs=gr.Textbox(label="Translation (English to Spanish)", lines=3),
    allow_flagging="never",
)
|
58 |
|
59 |
file_transcribe = gr.Interface(
|
60 |
+
fn=translate_audio,
|
61 |
inputs=gr.Audio(sources="upload",
|
62 |
type="filepath"),
|
63 |
outputs=gr.Textbox(label="Translation (English to Spanish)",
|