riteshkr committed on
Commit
b0c50f6
·
verified ·
1 Parent(s): 85743c2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -33
app.py CHANGED
@@ -8,7 +8,6 @@ BATCH_SIZE = 8
8
 
9
  device = 0 if torch.cuda.is_available() else "cpu"
10
 
11
- # Initialize the pipeline
12
  pipe = pipeline(
13
  task="automatic-speech-recognition",
14
  model=MODEL_NAME,
@@ -16,20 +15,28 @@ pipe = pipeline(
16
  device=device,
17
  )
18
 
 
 
19
  def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = "."):
20
  if seconds is not None:
21
  milliseconds = round(seconds * 1000.0)
 
22
  hours = milliseconds // 3_600_000
23
  milliseconds -= hours * 3_600_000
 
24
  minutes = milliseconds // 60_000
25
  milliseconds -= minutes * 60_000
 
26
  seconds = milliseconds // 1_000
27
  milliseconds -= seconds * 1_000
 
28
  hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
29
  return f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
30
  else:
 
31
  return seconds
32
 
 
33
  def transcribe(file, task, return_timestamps):
34
  outputs = pipe(file, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=return_timestamps)
35
  text = outputs["text"]
@@ -42,35 +49,53 @@ def transcribe(file, task, return_timestamps):
42
  text = "\n".join(str(feature) for feature in timestamps)
43
  return text
44
 
45
- # Use Blocks and modern Gradio components
46
- with gr.Blocks() as demo:
47
-
48
- with gr.TabbedInterface(["Transcribe Microphone", "Transcribe Audio File"]) as tabs:
49
-
50
- with gr.TabItem("Transcribe Microphone"):
51
- mic_audio = gr.Audio(source="microphone", type="filepath", label="Record Speech")
52
- task = gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
53
- return_timestamps = gr.Checkbox(label="Return timestamps")
54
- mic_output = gr.Textbox(label="Transcription")
55
- mic_button = gr.Button("Transcribe")
56
-
57
- mic_button.click(
58
- fn=transcribe,
59
- inputs=[mic_audio, task, return_timestamps],
60
- outputs=mic_output,
61
- )
62
-
63
- with gr.TabItem("Transcribe Audio File"):
64
- file_audio = gr.Audio(source="upload", type="filepath", label="Upload Audio File")
65
- task = gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
66
- return_timestamps = gr.Checkbox(label="Return timestamps")
67
- file_output = gr.Textbox(label="Transcription")
68
- file_button = gr.Button("Transcribe")
69
-
70
- file_button.click(
71
- fn=transcribe,
72
- inputs=[file_audio, task, return_timestamps],
73
- outputs=file_output,
74
- )
75
-
76
- demo.launch(enable_queue=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  device = 0 if torch.cuda.is_available() else "cpu"
10
 
 
11
  pipe = pipeline(
12
  task="automatic-speech-recognition",
13
  model=MODEL_NAME,
 
15
  device=device,
16
  )
17
 
18
+
19
+ # Copied from https://github.com/openai/whisper/blob/c09a7ae299c4c34c5839a76380ae407e7d785914/whisper/utils.py#L50
20
  def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = "."):
21
  if seconds is not None:
22
  milliseconds = round(seconds * 1000.0)
23
+
24
  hours = milliseconds // 3_600_000
25
  milliseconds -= hours * 3_600_000
26
+
27
  minutes = milliseconds // 60_000
28
  milliseconds -= minutes * 60_000
29
+
30
  seconds = milliseconds // 1_000
31
  milliseconds -= seconds * 1_000
32
+
33
  hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
34
  return f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
35
  else:
36
+ # we have a malformed timestamp so just return it as is
37
  return seconds
38
 
39
+
40
  def transcribe(file, task, return_timestamps):
41
  outputs = pipe(file, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=return_timestamps)
42
  text = outputs["text"]
 
49
  text = "\n".join(str(feature) for feature in timestamps)
50
  return text
51
 
52
+
53
+ demo = gr.Blocks()
54
+
55
+ mic_transcribe = gr.Interface(
56
+ fn=transcribe,
57
+ inputs=[
58
+ gr.Audio(source="microphone", type="filepath", optional=True),
59
+ gr.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
60
+ gr.Checkbox(default=False, label="Return timestamps"),
61
+ ],
62
+ outputs="text",
63
+ layout="horizontal",
64
+ theme="huggingface",
65
+ title="Whisper Demo: Transcribe Audio",
66
+ description=(
67
+ "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
68
+ f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
69
+ " of arbitrary length."
70
+ ),
71
+ allow_flagging="never",
72
+ )
73
+
74
+ file_transcribe = gr.Interface(
75
+ fn=transcribe,
76
+ inputs=[
77
+ gr.Audio(source="upload", optional=True, label="Audio file", type="filepath"),
78
+ gr.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
79
+ gr.Checkbox(default=False, label="Return timestamps"),
80
+ ],
81
+ outputs="text",
82
+ layout="horizontal",
83
+ theme="huggingface",
84
+ title="Whisper Demo: Transcribe Audio",
85
+ description=(
86
+ "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
87
+ f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
88
+ " of arbitrary length."
89
+ ),
90
+ examples=[
91
+ ["./example.flac", "transcribe", False],
92
+ ["./example.flac", "transcribe", True],
93
+ ],
94
+ cache_examples=True,
95
+ allow_flagging="never",
96
+ )
97
+
98
+ with demo:
99
+ gr.TabbedInterface([mic_transcribe, file_transcribe], ["Transcribe Microphone", "Transcribe Audio File"])
100
+
101
+ demo.launch(enable_queue=True)