RXTIME committed on
Commit
5259a84
·
verified ·
1 Parent(s): f50c959

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -1
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import os
2
  import subprocess
3
  import soundfile as sf
 
4
  from transformers import WhisperProcessor, WhisperForConditionalGeneration, MarianMTModel, MarianTokenizer
5
  from gtts import gTTS
6
  import gradio as gr
@@ -29,10 +30,23 @@ def transcribe_audio(audio_path: str) -> str:
29
  Transcreve o áudio para texto usando o modelo Whisper.
30
  """
31
  try:
 
 
 
 
 
 
 
 
 
 
 
 
32
  processor = WhisperProcessor.from_pretrained(WHISPER_MODEL)
33
  model = WhisperForConditionalGeneration.from_pretrained(WHISPER_MODEL)
34
- audio, _ = sf.read(audio_path)
35
  input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
 
 
36
  predicted_ids = model.generate(input_features)
37
  transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
38
  return transcription
 
1
  import os
2
  import subprocess
3
  import soundfile as sf
4
+ import librosa
5
  from transformers import WhisperProcessor, WhisperForConditionalGeneration, MarianMTModel, MarianTokenizer
6
  from gtts import gTTS
7
  import gradio as gr
 
30
  Transcreve o áudio para texto usando o modelo Whisper.
31
  """
32
  try:
33
+ # Carregar o áudio
34
+ audio, sample_rate = sf.read(audio_path)
35
+
36
+ # Verificar se o áudio está no formato correto
37
+ if len(audio.shape) > 1: # Se for stereo, converter para mono
38
+ audio = audio.mean(axis=1)
39
+
40
+ # Redimensionar para 16 kHz, se necessário
41
+ if sample_rate != 16000:
42
+ audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
43
+
44
+ # Processar o áudio
45
  processor = WhisperProcessor.from_pretrained(WHISPER_MODEL)
46
  model = WhisperForConditionalGeneration.from_pretrained(WHISPER_MODEL)
 
47
  input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
48
+
49
+ # Gerar transcrição
50
  predicted_ids = model.generate(input_features)
51
  transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
52
  return transcription