riteshkr committed
Commit 42de01f · verified · 1 Parent(s): 03acb2d

Update app.py

Files changed (1): app.py (+36 -36)
app.py CHANGED
@@ -1,53 +1,54 @@
 import torch
 from transformers import pipeline
-from transformers.pipelines.audio_utils import ffmpeg_read
 import gradio as gr
 
-MODEL_NAME = "riteshkr/quantized-whisper-large-v3"
+# Define the model details
+MODEL_NAME = "riteshkr/quantized-whisper-large-v3"  # Update with your actual model ID
 BATCH_SIZE = 8
 
+# Select device based on availability of CUDA (GPU) or fallback to CPU
 device = 0 if torch.cuda.is_available() else "cpu"
 
+# Load the ASR model pipeline
 pipe = pipeline(
     task="automatic-speech-recognition",
     model=MODEL_NAME,
     chunk_length_s=30,
     device=device,
 )
+
+# Utility function to format timestamps
 def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = "."):
     if seconds is not None:
         milliseconds = round(seconds * 1000.0)
-
         hours = milliseconds // 3_600_000
         milliseconds -= hours * 3_600_000
-
         minutes = milliseconds // 60_000
         milliseconds -= minutes * 60_000
-
         seconds = milliseconds // 1_000
         milliseconds -= seconds * 1_000
-
         hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
         return f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
     else:
         return seconds
 
-
-def transcribe(file, task, return_timestamps):
-    outputs = pipe(file, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=return_timestamps)
-    text = outputs["text"]
-    if return_timestamps:
-        timestamps = outputs["chunks"]
-        timestamps = [
-            f"[{format_timestamp(chunk['timestamp'][0])} -> {format_timestamp(chunk['timestamp'][1])}] {chunk['text']}"
-            for chunk in timestamps
-        ]
-        text = "\n".join(str(feature) for feature in timestamps)
-    return text
-
-
-demo = gr.Blocks()
-
+# Transcription function for batch processing
+def transcribe(files, task, return_timestamps):
+    transcriptions = []
+    for file in files:  # Process each file in the batch
+        outputs = pipe(file, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=return_timestamps)
+        text = outputs["text"]
+        if return_timestamps:
+            timestamps = outputs["chunks"]
+            formatted_chunks = [
+                f"[{format_timestamp(chunk['timestamp'][0])} -> {format_timestamp(chunk['timestamp'][1])}] {chunk['text']}"
+                for chunk in timestamps
+            ]
+            text = "\n".join(formatted_chunks)
+        transcriptions.append(text)
+    return "\n\n".join(transcriptions)  # Return all transcriptions combined
+
+# Define Gradio interface for microphone input
 mic_transcribe = gr.Interface(
     fn=transcribe,
     inputs=[
@@ -57,44 +58,43 @@ mic_transcribe = gr.Interface(
     ],
     outputs="text",
     layout="horizontal",
-    theme="huggingface",
-    title="Transcribe Audio",
+    title="Whisper Demo: Transcribe Audio",
     description=(
-        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
-        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
-        " of arbitrary length."
+        f"Transcribe long-form microphone inputs with the {MODEL_NAME} model. Supports transcription and translation."
     ),
     allow_flagging="never",
 )
 
+# Define Gradio interface for file upload
 file_transcribe = gr.Interface(
     fn=transcribe,
     inputs=[
-        gr.Audio(sources="upload", label="Audio file", type="filepath"),
+        gr.Audio(sources="upload", type="filepath", label="Upload Audio File"),
         gr.Radio(["transcribe", "translate"], label="Task"),
         gr.Checkbox(label="Return timestamps"),
     ],
     outputs="text",
     layout="horizontal",
-    theme="huggingface",
-    title="Transcribe Audio",
+    title="Whisper Demo: Transcribe Audio",
     description=(
-        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
-        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
-        " of arbitrary length."
+        f"Upload audio files to transcribe or translate them using the {MODEL_NAME} model."
    ),
+    allow_flagging="never",
     examples=[
         ["./example.flac", "transcribe", False],
         ["./example.flac", "transcribe", True],
     ],
-    cache_examples=True,
-    allow_flagging="never",
 )
 
+# Create the Gradio tabbed interface for switching between modes
+demo = gr.Blocks()
+
 with demo:
     gr.TabbedInterface(
-        [mic_transcribe, file_transcribe],
+        [mic_transcribe, file_transcribe],
         ["Transcribe Microphone", "Transcribe Audio File"]
     )
 
-demo.launch(enable_queue=True)
+# Launch the app
+if __name__ == "__main__":
+    demo.launch(debug=True, enable_queue=True, share=True)
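
For reference, a minimal sketch of how the revised batch-style transcribe() behaves when called directly, outside Gradio. Assumptions: this Space's app.py is importable as app (the import also loads the pipeline), and ./example.flac is the sample file already listed in the interface examples. Note that gr.Audio(type="filepath") delivers a single path string, so a caller must wrap it in a list before it reaches the for loop; also, enable_queue is a Gradio 3.x launch() parameter, while Gradio 4.x enables queuing via demo.queue() instead.

from app import transcribe, format_timestamp  # assumption: importing app.py also loads the pipeline

# Sanity check of the timestamp formatter: 3661.5 s -> "01:01:01.500"
assert format_timestamp(3661.5) == "01:01:01.500"

# transcribe() iterates over its first argument, so pass a list of paths;
# a bare string (as delivered by gr.Audio(type="filepath")) would be
# iterated character by character and must be wrapped in a list first.
print(transcribe(["./example.flac"], task="transcribe", return_timestamps=True))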