EwoutLagendijk committed
Commit 54eca9b · verified · 1 Parent(s): efa7028

Update app.py

Files changed (1)
  1. app.py +29 -23
app.py CHANGED
@@ -1,6 +1,5 @@
 import torch
-from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
-from transformers.pipelines.audio_utils import ffmpeg_read
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
 import gradio as gr
 import librosa
 
@@ -10,15 +9,14 @@ BATCH_SIZE = 8
 device = 0 if torch.cuda.is_available() else "cpu"
 
 # Load model and processor
-model_name = "EwoutLagendijk/whisper-small-indonesian"
-
-model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name)
-processor = AutoProcessor.from_pretrained(model_name)
+model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_NAME)
+processor = AutoProcessor.from_pretrained(MODEL_NAME)
 
 # Update the generation config for transcription
 model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="id", task="transcribe")
+model.config.no_repeat_ngram_size = 3
 
-def transcribe_speech(filepath):
+def transcribe_speech_with_timestamps(filepath):
     # Load the audio
     audio, sampling_rate = librosa.load(filepath, sr=16000)
 
@@ -30,6 +28,7 @@ def transcribe_speech(filepath):
     transcription = []
     for i in range(0, len(audio), chunk_samples):
         chunk = audio[i:i + chunk_samples]
+        chunk_start_time = i / sampling_rate  # Calculate chunk start time in seconds
 
         # Convert the chunk into input features
         inputs = processor(audio=chunk, sampling_rate=16000, return_tensors="pt").input_features
@@ -37,35 +36,42 @@ def transcribe_speech(filepath):
         # Generate transcription for the chunk
         generated_ids = model.generate(
             inputs,
-            max_new_tokens=444, # Max allowed by Whisper
-            forced_decoder_ids=processor.get_decoder_prompt_ids(language="id", task="transcribe"),
-            return_timestamps = True
+            max_new_tokens=444,
+            return_dict_in_generate=True,
+            output_scores=False,
+            output_attentions=False,
+            output_hidden_states=False,
+            forced_decoder_ids=processor.get_decoder_prompt_ids(language="id", task="transcribe")
         )
 
-        # Decode and append the transcription
-        chunk_transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-        transcription.append(chunk_transcription)
-
-    # Combine all chunk transcriptions into a single string
-    return " ".join(transcription)
-
-
+        # Decode the tokens into text and timestamps
+        token_transcriptions = processor.batch_decode(generated_ids["sequences"], skip_special_tokens=False)[0]
+        decoded_with_timestamps = processor.decode_with_timestamps(generated_ids["sequences"][0])
+
+        # Parse timestamps and transcription
+        for segment in decoded_with_timestamps:
+            start_time = chunk_start_time + segment['start']
+            end_time = chunk_start_time + segment['end']
+            text = segment['text']
+            transcription.append(f"[{start_time:.2f}s - {end_time:.2f}s]: {text}")
+
+    return "\n".join(transcription)
 
 demo = gr.Blocks()
 
 mic_transcribe = gr.Interface(
-    fn=transcribe_speech,
+    fn=transcribe_speech_with_timestamps,
     inputs=gr.Audio(sources="microphone", type="filepath"),
-    outputs=gr.components.Textbox(),
+    outputs=gr.Textbox(lines=10, label="Transcription with Timestamps"),
 )
 
 file_transcribe = gr.Interface(
-    fn=transcribe_speech,
+    fn=transcribe_speech_with_timestamps,
     inputs=gr.Audio(sources="upload", type="filepath"),
-    outputs=gr.components.Textbox(),
+    outputs=gr.Textbox(lines=10, label="Transcription with Timestamps"),
 )
 
 with demo:
     gr.TabbedInterface([mic_transcribe, file_transcribe], ["Transcribe Microphone", "Transcribe Audio File"])
 
-demo.launch(debug=True)
+demo.launch(debug=True)
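
Review note (editorial, not part of the commit): a few details in the new transcribe_speech_with_timestamps are worth flagging. MODEL_NAME is referenced but never defined in the shown hunks; presumably it is set in the unchanged lines near BATCH_SIZE = 8 (the deleted model_name line pointed at "EwoutLagendijk/whisper-small-indonesian"). The token_transcriptions value is assigned but never used. Most importantly, in current transformers releases the timestamp decoder is a Whisper tokenizer feature that yields a single string with <|t.tt|> markers (via tokenizer.decode(ids, decode_with_timestamps=True), or offset dicts via output_offsets=True), not a list of {'start', 'end', 'text'} dicts, so the `for segment in decoded_with_timestamps` loop would iterate over individual characters. Timestamp tokens also only appear when generation requests them, e.g. return_timestamps=True (which this commit removes), since get_decoder_prompt_ids() suppresses them by default. A minimal sketch of one way to recover per-segment timestamps under those assumptions; parse_timestamp_string is a hypothetical helper, not part of the app:

    import re

    def parse_timestamp_string(decoded: str, offset: float = 0.0):
        # Whisper renders timestamp tokens as <|t.tt|> pairs around each segment,
        # e.g. "<|0.00|> halo semua <|2.40|><|2.40|> apa kabar <|4.80|>".
        # Capture (start, text, end) triples and shift them by the chunk offset.
        pattern = re.compile(r"<\|(\d+\.\d+)\|>([^<]+)<\|(\d+\.\d+)\|>")
        segments = []
        for m in pattern.finditer(decoded):
            start, text, end = float(m.group(1)), m.group(2).strip(), float(m.group(3))
            if text:
                segments.append((offset + start, offset + end, text))
        return segments

Inside the chunk loop this would replace the segment loop, assuming generate() is called with return_timestamps=True:

    generated = model.generate(inputs, max_new_tokens=444, return_dict_in_generate=True, return_timestamps=True)
    decoded = processor.tokenizer.decode(generated["sequences"][0], decode_with_timestamps=True)
    for start, end, text in parse_timestamp_string(decoded, offset=chunk_start_time):
        transcription.append(f"[{start:.2f}s - {end:.2f}s]: {text}")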