Tamerstito committed
Commit 867e47c · verified · 1 Parent(s): 1cba309

Update app.py

Files changed (1): app.py +33 -22
app.py CHANGED
@@ -1,52 +1,63 @@
- from transformers import pipeline
- asr = pipeline(
-     task="automatic-speech-recognition",
-     model="openai/whisper-small",  # multilingual model
-     generate_kwargs={"task": "translate", "language": "es"}
- )
-
-
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
+ import torchaudio
+ import torch
  import os
  import gradio as gr
  from pydub import AudioSegment

- demo = gr.Blocks()
- def transcribe_speech(filepath):
+ # Load Whisper model and processor
+ model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
+ processor = WhisperProcessor.from_pretrained("openai/whisper-small")
+
+ # Get decoder prompts for translation to Spanish
+ forced_decoder_ids = processor.get_decoder_prompt_ids(language="es", task="translate")
+
+ # Function to process and translate audio
+ def translate_audio(filepath):
      if filepath is None:
          gr.Warning("No audio found, please retry.")
          return ""

-     # Load audio using pydub
+     # Load audio using pydub for chunking
      audio = AudioSegment.from_file(filepath)
      chunk_length_ms = 30 * 1000  # 30 seconds
      chunks = [audio[i:i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]

-     full_transcription = ""
+     full_translation = ""

      for i, chunk in enumerate(chunks):
-         # Export chunk to temporary wav file
          chunk_path = f"chunk_{i}.wav"
          chunk.export(chunk_path, format="wav")

-         # Transcribe the chunk
-         result = asr(chunk_path)
-         full_transcription += result["text"] + " "
+         # Load chunk for model input
+         waveform, sample_rate = torchaudio.load(chunk_path)
+         inputs = processor(waveform[0], sampling_rate=sample_rate, return_tensors="pt")
+
+         # Generate translated output
+         with torch.no_grad():
+             generated_ids = model.generate(
+                 inputs["input_features"],
+                 forced_decoder_ids=forced_decoder_ids
+             )
+
+         translation = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+         full_translation += translation + " "

-         # Clean up (optional)
          os.remove(chunk_path)

-     return full_transcription.strip()
-
+     return full_translation.strip()
+
+ # Gradio UI components
  mic_transcribe = gr.Interface(
-     fn=transcribe_speech,
+     fn=translate_audio,
      inputs=gr.Audio(sources="microphone",
                      type="filepath"),
-     outputs=gr.Textbox(label="Transcription",
+     outputs=gr.Textbox(label="Translation (English to Spanish)",
                         lines=3),
      allow_flagging="never")

  file_transcribe = gr.Interface(
-     fn=transcribe_speech,
+     fn=translate_audio,
      inputs=gr.Audio(sources="upload",
                      type="filepath"),
      outputs=gr.Textbox(label="Translation (English to Spanish)",
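
A note on the new loop (not part of this commit): it passes each chunk's native sample rate straight to the processor, while Whisper's feature extractor expects 16 kHz input, so microphone recordings or uploads at other rates may be rejected. A minimal sketch of a guard at the point where each chunk is loaded; the resampling step is an assumption, not code from this commit:

    # Hypothetical guard (not in this commit): resample each chunk to the
    # 16 kHz rate that Whisper's feature extractor expects.
    waveform, sample_rate = torchaudio.load(chunk_path)
    if sample_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, orig_freq=sample_rate, new_freq=16000)
        sample_rate = 16000
    inputs = processor(waveform[0], sampling_rate=sample_rate, return_tensors="pt")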
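
The diff view is truncated before the end of app.py, so the code that assembles and launches the demo is not shown (the old version created demo = gr.Blocks()). For reference, one common way to expose both interfaces, sketched as an assumption rather than the file's actual tail; the tab names are illustrative:

    # Hypothetical wiring (the real tail of app.py is cut off in this view):
    # expose the two interfaces as tabs and start the app.
    demo = gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File"],
    )
    demo.launch()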