EwoutLagendijk committed
Commit
efa7028
1 Parent(s): cd7c511

Update app.py

Files changed (1):
  1. app.py +9 -28
app.py CHANGED
@@ -1,5 +1,5 @@
 import torch
-from transformers import pipeline
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 from transformers.pipelines.audio_utils import ffmpeg_read
 import gradio as gr
 import librosa
@@ -9,34 +9,14 @@ BATCH_SIZE = 8
 
 device = 0 if torch.cuda.is_available() else "cpu"
 
-pipe = pipeline(
-    task="automatic-speech-recognition",
-    model=MODEL_NAME,
-    chunk_length_s=30,
-    device=device,
-)
-
-
-# Copied from https://github.com/openai/whisper/blob/c09a7ae299c4c34c5839a76380ae407e7d785914/whisper/utils.py#L50
-def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = "."):
-    if seconds is not None:
-        milliseconds = round(seconds * 1000.0)
-
-        hours = milliseconds // 3_600_000
-        milliseconds -= hours * 3_600_000
-
-        minutes = milliseconds // 60_000
-        milliseconds -= minutes * 60_000
-
-        seconds = milliseconds // 1_000
-        milliseconds -= seconds * 1_000
+# Load model and processor
+model_name = "EwoutLagendijk/whisper-small-indonesian"
 
-        hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
-        return f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
-    else:
-        # we have a malformed timestamp so just return it as is
-        return seconds
+model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name)
+processor = AutoProcessor.from_pretrained(model_name)
 
+# Update the generation config for transcription
+model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="id", task="transcribe")
 
 def transcribe_speech(filepath):
     # Load the audio
@@ -58,7 +38,8 @@ def transcribe_speech(filepath):
     generated_ids = model.generate(
         inputs,
         max_new_tokens=444,  # Max allowed by Whisper
-        forced_decoder_ids=processor.get_decoder_prompt_ids(language="id", task="transcribe")
+        forced_decoder_ids=processor.get_decoder_prompt_ids(language="id", task="transcribe"),
+        return_timestamps=True
     )
 
     # Decode and append the transcription
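For context: this commit replaces the high-level pipeline wrapper with a directly loaded model and processor, forces Indonesian transcription via decoder prompt IDs, and requests timestamped generation. The diff shows only fragments of transcribe_speech, so the sketch below fills in the surrounding steps to show how the new pieces fit together end to end. The librosa.load and processor(...) feature-extraction calls, the transcribe function name, and the sample.wav input are illustrative assumptions, not code from this commit; the model.generate arguments mirror the lines added above.

import librosa
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor

model_name = "EwoutLagendijk/whisper-small-indonesian"
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name)
processor = AutoProcessor.from_pretrained(model_name)

def transcribe(filepath):
    # Whisper models expect 16 kHz mono audio (assumption: app.py resamples the same way)
    audio, sr = librosa.load(filepath, sr=16000)

    # Turn the waveform into log-mel input features for the encoder
    inputs = processor(audio, sampling_rate=sr, return_tensors="pt").input_features

    generated_ids = model.generate(
        inputs,
        max_new_tokens=444,  # stays under Whisper's 448-token decoder limit
        # Force Indonesian transcription, as in the commit; recent transformers
        # releases prefer generate(..., language="id", task="transcribe") and may
        # warn that forced_decoder_ids is deprecated.
        forced_decoder_ids=processor.get_decoder_prompt_ids(language="id", task="transcribe"),
        return_timestamps=True,
    )

    # skip_special_tokens drops the language/task/timestamp control tokens
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

print(transcribe("sample.wav"))  # hypothetical input file

Note that setting model.config.forced_decoder_ids once at load time and also passing forced_decoder_ids to generate, as the commit does, is redundant; either alone determines the language/task prompt.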