Update app.py
Browse files
app.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
import os
|
2 |
import subprocess
|
3 |
import soundfile as sf
|
|
|
4 |
from transformers import WhisperProcessor, WhisperForConditionalGeneration, MarianMTModel, MarianTokenizer
|
5 |
from gtts import gTTS
|
6 |
import gradio as gr
|
@@ -29,10 +30,23 @@ def transcribe_audio(audio_path: str) -> str:
|
|
29 |
Transcreve o áudio para texto usando o modelo Whisper.
|
30 |
"""
|
31 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
processor = WhisperProcessor.from_pretrained(WHISPER_MODEL)
|
33 |
model = WhisperForConditionalGeneration.from_pretrained(WHISPER_MODEL)
|
34 |
-
audio, _ = sf.read(audio_path)
|
35 |
input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
|
|
|
|
|
36 |
predicted_ids = model.generate(input_features)
|
37 |
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
|
38 |
return transcription
|
|
|
1 |
import os
|
2 |
import subprocess
|
3 |
import soundfile as sf
|
4 |
+
import librosa
|
5 |
from transformers import WhisperProcessor, WhisperForConditionalGeneration, MarianMTModel, MarianTokenizer
|
6 |
from gtts import gTTS
|
7 |
import gradio as gr
|
|
|
30 |
Transcreve o áudio para texto usando o modelo Whisper.
|
31 |
"""
|
32 |
try:
|
33 |
+
# Carregar o áudio
|
34 |
+
audio, sample_rate = sf.read(audio_path)
|
35 |
+
|
36 |
+
# Verificar se o áudio está no formato correto
|
37 |
+
if len(audio.shape) > 1: # Se for stereo, converter para mono
|
38 |
+
audio = audio.mean(axis=1)
|
39 |
+
|
40 |
+
# Redimensionar para 16 kHz, se necessário
|
41 |
+
if sample_rate != 16000:
|
42 |
+
audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
|
43 |
+
|
44 |
+
# Processar o áudio
|
45 |
processor = WhisperProcessor.from_pretrained(WHISPER_MODEL)
|
46 |
model = WhisperForConditionalGeneration.from_pretrained(WHISPER_MODEL)
|
|
|
47 |
input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
|
48 |
+
|
49 |
+
# Gerar transcrição
|
50 |
predicted_ids = model.generate(input_features)
|
51 |
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
|
52 |
return transcription
|